diff --git a/.github/workflows/run-schedulers b/.github/workflows/run-schedulers new file mode 100755 index 0000000000000..0aa3ffea3bff7 --- /dev/null +++ b/.github/workflows/run-schedulers @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Run sched-ext scheduler for TEST_TIMEOUT seconds inside virtme-ng and catch +# potential errors, then unload the scheduler and return the exit status. + +# Maximum time for each scheduler run. +TEST_TIMEOUT=30 + +# Maximum timeout for the guest used for each scheduler run (this is used to +# hard-shutdown the guest in case of system hangs). +GUEST_TIMEOUT=60 + +# Check if virtme-ng is available (command -v fails cleanly when vng is absent). +if ! command -v vng >/dev/null 2>&1; then + echo "vng not found, please install virtme-ng to enable testing" + exit 1 +fi + +# Test all the available schedulers. +# +# NOTE: virtme-ng automatically runs the kernel from the current working +# directory by default. +# +# Each scheduler will be tested in a separate instance booted from scratch, to +# ensure that each run does not impact the others. +# +# TODO: exclude scx_layered for now, because it requires a special config +# file, otherwise its test would fail with "Error: No layer spec". +# +# Maybe in the future change scx_layered to run with a default layer spec, just +# for testing it. 
+# +for sched in $(find tools/sched_ext/build/bin -type f -executable | grep -v scx_layered); do + rm -f /tmp/output + (timeout --foreground --preserve-status ${GUEST_TIMEOUT} \ + vng --force-9p --disable-microvm --verbose -- \ + "timeout --foreground --preserve-status ${TEST_TIMEOUT} ${sched}" \ + 2>&1 type; + } + + SEC(".struct_ops") + struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple", + }; + +Dispatch Queues +--------------- + +To match the impedance between the scheduler core and the BPF scheduler, +sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a +priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), +and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage +an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and +``scx_bpf_destroy_dsq()``. + +A CPU always executes a task from its local DSQ. A task is "dispatched" to a +DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's +local DSQ. + +When a CPU is looking for the next task to run, if the local DSQ is not +empty, the first task is picked. Otherwise, the CPU tries to consume the +global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` +is invoked. + +Scheduling Cycle +---------------- + +The following briefly shows how a waking task is scheduled and executed. + +1. When a task is waking up, ``ops.select_cpu()`` is the first operation + invoked. This serves two purposes. First, CPU selection optimization + hint. Second, waking up the selected CPU if idle. + + The CPU selected by ``ops.select_cpu()`` is an optimization hint and not + binding. The actual decision is made at the last step of scheduling. + However, there is a small performance gain if the CPU + ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. 
+ + A side-effect of selecting a CPU is waking it up from idle. While a BPF + scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, + using ``ops.select_cpu()`` judiciously can be simpler and more efficient. + + A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by + calling ``scx_bpf_dispatch()``. If the task is dispatched to + ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the + local DSQ of whichever CPU is returned from ``ops.select_cpu()``. + Additionally, dispatching directly from ``ops.select_cpu()`` will cause the + ``ops.enqueue()`` callback to be skipped. + + Note that the scheduler core will ignore an invalid CPU selection, for + example, if it's outside the allowed cpumask of the task. + +2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the + task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()`` + can make one of the following decisions: + + * Immediately dispatch the task to either the global or local DSQ by + calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or + ``SCX_DSQ_LOCAL``, respectively. + + * Immediately dispatch the task to a custom DSQ by calling + ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. + + * Queue the task on the BPF side. + +3. When a CPU is ready to schedule, it first looks at its local DSQ. If + empty, it then looks at the global DSQ. If there still isn't a task to + run, ``ops.dispatch()`` is invoked which can use the following two + functions to populate the local DSQ. + + * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can + be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, + ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` + currently can't be called with BPF locks held, this is being worked on + and will be supported. ``scx_bpf_dispatch()`` schedules dispatching + rather than performing them immediately. 
There can be up to + ``ops.dispatch_max_batch`` pending tasks. + + * ``scx_bpf_consume()`` transfers a task from the specified non-local DSQ + to the dispatching DSQ. This function cannot be called with any BPF + locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks + before trying to consume the specified DSQ. + +4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, + the CPU runs the first one. If empty, the following steps are taken: + + * Try to consume the global DSQ. If successful, run the task. + + * If ``ops.dispatch()`` has dispatched any tasks, retry #3. + + * If the previous task is an SCX task and still runnable, keep executing + it (see ``SCX_OPS_ENQ_LAST``). + + * Go idle. + +Note that the BPF scheduler can always choose to dispatch tasks immediately +in ``ops.enqueue()`` as illustrated in the above simple example. If only the +built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as +a task is never queued on the BPF scheduler and both the local and global +DSQs are consumed automatically. + +``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use +``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as +``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue +dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the +function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for +more information. + +Where to Look +============= + +* ``include/linux/sched/ext.h`` defines the core data structures, ops table + and constants. + +* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. + The functions prefixed with ``scx_bpf_`` can be called from the BPF + scheduler. + +* ``tools/sched_ext/`` hosts example BPF scheduler implementations. + + * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a + custom DSQ. 
+ + * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five + levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. + +ABI Instability +=============== + +The APIs provided by sched_ext to BPF scheduler programs have no stability +guarantees. This includes the ops table callbacks and constants defined in +``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in +``kernel/sched/ext.c``. + +While we will attempt to provide a relatively stable API surface when +possible, they are subject to change without warning between kernel +versions. diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a692..91645a4518c8a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19461,6 +19461,8 @@ R: Ben Segall (CONFIG_CFS_BANDWIDTH) R: Mel Gorman (CONFIG_NUMA_BALANCING) R: Daniel Bristot de Oliveira (SCHED_DEADLINE) R: Valentin Schneider (TOPOLOGY) +R: Tejun Heo (SCHED_EXT) +R: David Vernet (SCHED_EXT) L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core @@ -19469,6 +19471,7 @@ F: include/linux/sched.h F: include/linux/wait.h F: include/uapi/linux/sched.h F: kernel/sched/ +F: tools/sched_ext/ SCSI LIBSAS SUBSYSTEM R: John Garry diff --git a/Makefile b/Makefile index 9869f57c3fb3e..af261e2c939ba 100644 --- a/Makefile +++ b/Makefile @@ -1342,6 +1342,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),) $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean endif +tools-clean-targets := sched_ext +PHONY += $(tools-clean-targets) +$(tools-clean-targets): + $(Q)$(MAKE) -sC tools $@_clean +tools_clean: $(tools-clean-targets) + # Clear a bunch of variables before executing the submake ifeq ($(quiet),silent_) tools_silent=s @@ -1511,7 +1517,7 @@ PHONY += $(mrproper-dirs) mrproper $(mrproper-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) -mrproper: clean $(mrproper-dirs) +mrproper: clean $(mrproper-dirs) tools_clean $(call cmd,rmfiles) @find . 
$(RCS_FIND_IGNORE) \ \( -name '*.rmeta' \) \ diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 02217e3c916b5..1ce3535cba6de 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -520,6 +520,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { NULL, /* P */ NULL, /* Q */ NULL, /* R */ + /* S: May be registered by sched_ext for resetting */ NULL, /* S */ NULL, /* T */ NULL, /* U */ diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index bce1d7ac95caa..a62960fdf73f0 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -27,7 +27,7 @@ static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ -static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ +static DEFINE_RAW_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -539,6 +539,7 @@ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; + unsigned long flags; if (!kn || !atomic_dec_and_test(&kn->count)) return; @@ -563,9 +564,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, flags); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -607,6 +608,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *kn; u32 id_highbits; int ret; + unsigned long irqflags; name = kstrdup_const(name, GFP_KERNEL); if (!name) @@ -617,13 +619,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, goto err_out1; idr_preload(GFP_KERNEL); - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); ret 
= idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); idr_preload_end(); if (ret < 0) goto err_out2; @@ -659,9 +661,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -714,8 +716,9 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); + unsigned long flags; - spin_lock(&kernfs_idr_lock); + raw_spin_lock_irqsave(&kernfs_idr_lock, flags); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) @@ -739,10 +742,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); return kn; err_unlock: - spin_unlock(&kernfs_idr_lock); + raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags); return NULL; } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5dd3a61d673d4..b84816895440f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -131,6 +131,7 @@ *(__dl_sched_class) \ *(__rt_sched_class) \ *(__fair_sched_class) \ + *(__ext_sched_class) \ *(__idle_sched_class) \ __sched_class_lowest = .; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861cd369..bfc027311950e 100644 --- a/include/linux/cgroup-defs.h +++ 
b/include/linux/cgroup-defs.h @@ -132,12 +132,18 @@ enum { CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftype() */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; +enum cfile_flags { + CFILE_HIDDEN = (1 << 0), /* file instance hidden */ +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can @@ -145,7 +151,9 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ + struct cftype *cft; struct kernfs_node *kn; + unsigned int flags; unsigned long notified_at; struct timer_list notify_timer; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 34aaf0e87def8..32679fcff0a72 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,8 +29,6 @@ struct kernel_clone_args; -#ifdef CONFIG_CGROUPS - /* * All weight knobs on the default hierarchy should use the following min, * default and max values. 
The default value is the logarithmic center of @@ -40,6 +38,8 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +#ifdef CONFIG_CGROUPS + enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ @@ -114,6 +114,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_show_cftype(struct cftype *cft, bool show); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); diff --git a/include/linux/sched.h b/include/linux/sched.h index cdb8ea53c365b..b8a02b4cf3667 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -80,6 +80,8 @@ struct task_group; struct task_struct; struct user_event_mm; +#include + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -798,6 +800,9 @@ struct task_struct { struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 0000000000000..ae552129931a9 --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,749 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT + +#include +#include + +struct cgroup; + +enum scx_consts { + SCX_OPS_NAME_LEN = 128, + SCX_EXIT_REASON_LEN = 128, + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + + SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. + * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. + */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* BPF unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. 
+ */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + /* textual representation of the above */ + char reason[SCX_EXIT_REASON_LEN]; + /* number of entries in the backtrace */ + u32 bt_len; + /* backtrace if exiting due to an error */ + unsigned long bt[SCX_EXIT_BT_LEN]; + /* extra message */ + char msg[SCX_EXIT_MSG_LEN]; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other tasks to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. 
+ */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * CPU cgroup knob enable flags + */ + SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_CGROUP_KNOB_WEIGHT, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * Userland can implement an arbitrary scheduling policy by implementing and + * loading operations in this table. + */ +struct sched_ext_ops { + /** + * select_cpu - Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. 
@p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p + * is dispatched, the ops.enqueue() callback will be skipped. Finally, + * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the + * local DSQ of whatever CPU is returned by this callback. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * enqueue - Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() + * or enqueue on the BPF scheduler. If not directly dispatched, the bpf + * scheduler owns @p and if it fails to dispatch @p, the task will + * stall. + * + * If @p was dispatched from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * dequeue - Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. 
+ */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using + * scx_bpf_consume(). + * + * The maximum number of times scx_bpf_dispatch() can be called without + * an intervening scx_bpf_consume() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * runnable - A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU. 
Likewise, a task + * may be ->enqueue()'d without being preceded by this operation e.g. + * after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * running - A task is starting to run on its associated CPU + * @p: task starting to run + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * stopping - A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * quiescent - A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * yield - Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. 
+ */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * core_sched_before - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a,struct task_struct *b); + + /** + * set_weight - Set task weight + * @p: task to set weight for + * @weight: new weight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * set_cpumask - Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * update_idle - Update the idle state of a CPU + * @cpu: CPU to update the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @cpu enters or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. 
+ */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * init_task - Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. + */ + s32 (*init_task)(struct task_struct *p, + struct scx_init_task_args *args); + + /** + * exit_task - Exit a previously-running task from the system + * @p: task to exit + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, + struct scx_exit_task_args *args); + + /** + * enable - Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. 
@p is now in the cgroup specified in + * @args. enable() is called on @p any time it enters SCX, and is + * always paired with a matching disable(). + */ + void (*enable)(struct task_struct *p); + + /** + * disable - Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * cgroup_init - Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * cgroup_exit - Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation may block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. 
+ */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_move - Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_cancel_move - Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * cgroup_set_weight - A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @cgrp's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +#endif /* CONFIG_EXT_GROUP_SCHED */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * cpu_online - A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * cpu_offline - A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks + * associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). 
+ */ + + /** + * init - Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * exit - Clean up after the BPF scheduler + * @info: Exit info + */ + void (*exit)(struct scx_exit_info *info); + + /** + * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * flags - %SCX_OPS_* flags + */ + u64 flags; + + /** + * timeout_ms - The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * name - BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; +}; + +/* + * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the + * scheduler core and the BPF scheduler. See the documentation for more details. 
+ */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct list_head fifo; /* processed in dispatching order */ + struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */ + u32 nr; + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + + SCX_TASK_NR_STATES, +}; + +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). 
+ */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ + /* all non-sleepables may be nested inside INIT and SLEEPABLE */ + SCX_KF_INIT = 1 << 0, /* running ops.init() */ + SCX_KF_SLEEPABLE = 1 << 1, /* other sleepable init operations */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 2, /* ops.cpu_release() */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ + SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 5, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 6, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. + */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct { + struct list_head fifo; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + } dsq_node; + u32 flags; /* protected by rq lock */ + u32 dsq_flags; /* protected by dsq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ + atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; + +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dispatch() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. 
+ * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. + */ + u64 slice; + + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.init_task() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inheriting the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of + * such events is reported through /sys/kernel/debug/sched_ext::nr_rejected. 
+ */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ + struct list_head tasks_node; +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif +}; + +void sched_ext_free(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); + +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index d362aacf9f897..4df2f90555879 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab26..359a14cc76a40 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -118,6 +118,7 @@ struct clone_args { /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 #define SCHED_DEADLINE 6 +#define SCHED_EXT 7 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 diff --git a/init/Kconfig b/init/Kconfig index 8df18f3a97484..903f2e08a784b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1012,6 +1012,11 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. 
+config EXT_GROUP_SCHED + bool + depends on SCHED_CLASS_EXT && CGROUP_SCHED + default y + endif #CGROUP_SCHED config SCHED_MM_CID diff --git a/init/init_task.c b/init/init_task.c index 7ecb458eb3da6..10a1df7dfb7fa 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,20 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #endif #ifdef CONFIG_CGROUP_SCHED .sched_task_group = &root_task_group, +#endif +#ifdef CONFIG_SCHED_CLASS_EXT + .scx = { + .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), + .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), + .flags = 0, + .sticky_cpu = -1, + .holding_cpu = -1, + .ops_state = ATOMIC_INIT(0), + .runnable_at = INITIAL_JIFFIES, + .slice = SCX_SLICE_DFL, + .ddsp_dsq_id = SCX_DSQ_INVALID, + .ddsp_enq_flags = 0, + }, #endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c2f1fd95a8214..bae49b743834b 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -133,4 +133,26 @@ config SCHED_CORE which is the likely usage by Linux distributions, there should be no measurable impact on performance. - +config SCHED_CLASS_EXT + bool "Extensible Scheduling Class" + depends on BPF_SYSCALL && BPF_JIT + help + This option enables a new scheduler class sched_ext (SCX), which + allows scheduling policies to be implemented as BPF programs to + achieve the following: + + - Ease of experimentation and exploration: Enabling rapid + iteration of new scheduling policies. + - Customization: Building application-specific schedulers which + implement policies that are not applicable to general-purpose + schedulers. + - Rapid scheduler deployments: Non-disruptive swap outs of + scheduling policies in production environments. 
+ + sched_ext leverages BPF’s struct_ops feature to define a structure + which exports function callbacks and flags to BPF programs that + wish to implement scheduling policies. The struct_ops structure + exported by sched_ext is struct sched_ext_ops, and is conceptually + similar to struct sched_class. + + See Documentation/scheduler/sched-ext.rst for more details. diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h index 5678a9ddf8178..3618769d853d0 100644 --- a/kernel/bpf/bpf_struct_ops_types.h +++ b/kernel/bpf/bpf_struct_ops_types.h @@ -9,4 +9,8 @@ BPF_STRUCT_OPS_TYPE(bpf_dummy_ops) #include BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) #endif +#ifdef CONFIG_SCHED_CLASS_EXT +#include +BPF_STRUCT_OPS_TYPE(sched_ext_ops) +#endif #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c851cf..b316d3b7ffe7a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4206,10 +4206,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (IS_ERR(kn)) return PTR_ERR(kn); + kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); + if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + cfile->cft = cft; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; @@ -4485,6 +4488,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + return kn; +} + +static bool cfile_visible(struct cgroup_file *cfile) +{ + return !(cfile->cft->flags & CFTYPE_HIDDEN) && + !(cfile->flags & CFILE_HIDDEN); +} + /** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset @@ -4494,15 
+4515,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); - kn = cfile->kn; - kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + mutex_lock(&cgroup_mutex); - if (kn) - kernfs_show(kn, show); + if (show) + cfile->flags &= ~CFILE_HIDDEN; + else + cfile->flags |= CFILE_HIDDEN; - kernfs_put(kn); + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + + mutex_unlock(&cgroup_mutex); } /** @@ -5526,6 +5552,63 @@ static void offline_css(struct cgroup_subsys_state *css) wake_up_all(&css->cgroup->offline_waitq); } +/** + * cgroup_show_cftype - show or hide a cgroup file type + * @cft: cftype to show or hide + * @show: whether to show or hide + * + * Sets %CFTYPE_HIDDEN and shows/hides the matching files according to @show. + * @cft may or may not be added at the time of this call. After hiding, it's + * guaranteed that there are no in-flight operations on the hidden files. + */ +void cgroup_show_cftype(struct cftype *cft, bool show) +{ + struct cgroup_subsys *ss = cft->ss; + struct cgroup *root = ss ? 
&ss->root->cgrp : &cgrp_dfl_root.cgrp; + struct cgroup_subsys_state *css; + + mutex_lock(&cgroup_mutex); + + if (show) + cft->flags &= ~CFTYPE_HIDDEN; + else + cft->flags |= CFTYPE_HIDDEN; + + if (!(cft->flags & __CFTYPE_ADDED)) + goto out_unlock; + + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { + struct cgroup *cgrp = css->cgroup; + struct kernfs_node *kn; + + if (!(css->flags & CSS_VISIBLE)) + continue; + + if (cft->file_offset) { + struct cgroup_file *cfile = + (void *)css + cft->file_offset; + + kn = cfile_kn_get(cfile); + if (kn) { + kernfs_show(kn, cfile_visible(cfile)); + kernfs_put(kn); + } + } else { + char buf[CGROUP_FILE_NAME_MAX]; + + kn = kernfs_find_and_get(cgrp->kn, + cgroup_file_name(cgrp, cft, buf)); + if (kn) { + kernfs_show(kn, show); + kernfs_put(kn); + } + } + } + +out_unlock: + mutex_unlock(&cgroup_mutex); +} + /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with diff --git a/kernel/fork.c b/kernel/fork.c index 47ff3b35352e0..92078466e67e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -968,6 +969,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); @@ -2469,7 +2471,7 @@ __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; @@ -2600,7 +2602,9 @@ __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. 
*/ - sched_cgroup_fork(p, args); + retval = sched_cgroup_fork(p, args); + if (retval) + goto bad_fork_cancel_cgroup; /* * From this point on we must avoid any synchronous user-space @@ -2646,13 +2650,13 @@ __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* No more failure paths after this point. */ @@ -2726,10 +2730,11 @@ __latent_entropy struct task_struct *copy_process( return p; -bad_fork_cancel_cgroup: +bad_fork_core_free: sched_core_free(p); spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); +bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { @@ -2768,6 +2773,8 @@ __latent_entropy struct task_struct *copy_process( audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_sched_cancel_fork: + sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index d9dc9ab3773f2..392c91667767d 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -21,13 +21,17 @@ #include #include +#include #include +#include #include #include #include #include #include #include +#include +#include #include @@ -52,3 +56,6 @@ #include "cputime.c" #include "deadline.c" +#ifdef CONFIG_SCHED_CLASS_EXT +# include "ext.c" +#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc903467..a80d257b43210 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 
140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a, if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } @@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (!scx_switched_all() && rq->nr_running > 1) + return false; + + if (scx_enabled() && !scx_can_stop_tick(rq)) return false; /* @@ -1342,8 +1353,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); + if (update_load && p->sched_class->reweight_task) { + p->sched_class->reweight_task(task_rq(p), p, prio); } else { load->weight = scale_load(sched_prio_to_weight[prio]); load->inv_weight = sched_prio_to_wmult[prio]; @@ -2216,6 +2227,17 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. 
+ */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + /* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. @@ -2223,9 +2245,9 @@ inline int task_curr(const struct task_struct *p) * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -3977,6 +3999,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { + /* + * The BPF scheduler may depend on select_task_rq() being invoked during + * wakeups. In addition, @p may end up executing on a different CPU + * regardless of what happens in the wakeup path making the ttwu_queue + * optimization less meaningful. Skip if on SCX. + */ + if (task_on_scx(p)) + return false; + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. 
@@ -4544,6 +4575,23 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; +#ifdef CONFIG_SCHED_CLASS_EXT + p->scx.dsq = NULL; + INIT_LIST_HEAD(&p->scx.dsq_node.fifo); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + INIT_LIST_HEAD(&p->scx.runnable_node); + p->scx.flags = 0; + p->scx.weight = 0; + p->scx.sticky_cpu = -1; + p->scx.holding_cpu = -1; + p->scx.kf_mask = 0; + atomic_long_set(&p->scx.ops_state, 0); + p->scx.runnable_at = INITIAL_JIFFIES; + p->scx.slice = SCX_SLICE_DFL; + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -4747,6 +4795,8 @@ late_initcall(sched_core_sysctl_init); */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { + int ret; + __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -4783,12 +4833,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (dl_prio(p->prio)) - return -EAGAIN; - else if (rt_prio(p->prio)) + scx_pre_fork(p); + + if (dl_prio(p->prio)) { + ret = -EAGAIN; + goto out_cancel; + } else if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else +#ifdef CONFIG_SCHED_CLASS_EXT + } else if (task_should_scx(p)) { + p->sched_class = &ext_sched_class; +#endif + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); @@ -4806,9 +4864,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif return 0; + +out_cancel: + scx_cancel_fork(p); + return ret; } -void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; @@ -4835,11 +4897,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) 
p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return scx_fork(p); +} + +void sched_cancel_fork(struct task_struct *p) +{ + scx_cancel_fork(p); } void sched_post_fork(struct task_struct *p) { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -5684,14 +5754,17 @@ void scheduler_tick(void) if (sched_feat(LATENCY_WARN) && resched_latency) resched_latency_warn(cpu, resched_latency); + scx_notify_sched_tick(); perf_event_task_tick(); if (curr->flags & PF_WQ_WORKER) wq_worker_tick(curr); #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + trigger_load_balance(rq); + } #endif } @@ -5991,7 +6064,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { + for_balance_class_range(class, prev->sched_class, &idle_sched_class) { if (class->balance(rq, prev, rf)) break; } @@ -6009,6 +6082,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -6049,10 +6125,12 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (prev->dl_server) prev->dl_server = NULL; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_next_task(rq); - if (p) + if (p) { + scx_notify_pick_next_task(rq, p, class); return p; + } } BUG(); /* The idle class should always have a runnable task. 
*/ @@ -6082,7 +6160,7 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; - for_each_class(class) { + for_each_active_class(class) { p = class->pick_task(rq); if (p) return p; @@ -7060,12 +7138,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -static void __setscheduler_prio(struct task_struct *p, int prio) +void __setscheduler_prio(struct task_struct *p, int prio) { if (dl_prio(prio)) p->sched_class = &dl_sched_class; else if (rt_prio(prio)) p->sched_class = &rt_sched_class; +#ifdef CONFIG_SCHED_CLASS_EXT + else if (task_should_scx(p)) + p->sched_class = &ext_sched_class; +#endif else p->sched_class = &fair_sched_class; @@ -7226,6 +7308,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } __setscheduler_prio(p, prio); + check_class_changing(rq, p, prev_class); if (queued) enqueue_task(rq, p, queue_flag); @@ -7769,6 +7852,10 @@ static int __sched_setscheduler(struct task_struct *p, goto unlock; } + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + /* * If not changing anything there's no need to proceed further, * but store a possible modification of reset_on_fork. 
@@ -7871,6 +7958,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_prio(p, newprio); } __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); if (queued) { /* @@ -9046,6 +9134,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; break; } @@ -9073,6 +9162,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: + case SCHED_EXT: ret = 0; } return ret; @@ -9168,6 +9258,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); + print_scx_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -9553,7 +9644,7 @@ static inline void balance_hotplug_wait(void) #endif /* CONFIG_HOTPLUG_CPU */ -void set_rq_online(struct rq *rq) +void set_rq_online(struct rq *rq, enum rq_onoff_reason reason) { if (!rq->online) { const struct sched_class *class; @@ -9563,12 +9654,12 @@ void set_rq_online(struct rq *rq) for_each_class(class) { if (class->rq_online) - class->rq_online(rq); + class->rq_online(rq, reason); } } } -void set_rq_offline(struct rq *rq) +void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason) { if (rq->online) { const struct sched_class *class; @@ -9576,7 +9667,7 @@ void set_rq_offline(struct rq *rq) update_rq_clock(rq); for_each_class(class) { if (class->rq_offline) - class->rq_offline(rq); + class->rq_offline(rq, reason); } cpumask_clear_cpu(rq->cpu, rq->rd->online); @@ -9672,7 +9763,7 @@ int sched_cpu_activate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9716,7 +9807,7 @@ int sched_cpu_deactivate(unsigned int cpu) rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); + set_rq_offline(rq, 
RQ_ONOFF_HOTPLUG); } rq_unlock_irqrestore(rq, &rf); @@ -9903,11 +9994,15 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class != &fair_sched_class + 1 || - &fair_sched_class != &rt_sched_class + 1 || - &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class != &stop_sched_class + 1); + BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); +#endif + BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); + BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); + BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); +#ifdef CONFIG_SCHED_CLASS_EXT + BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif wait_bit_init(); @@ -9931,6 +10026,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -10076,6 +10174,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); + init_sched_ext_class(); psi_init(); @@ -10361,6 +10460,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -10488,6 +10588,7 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); + scx_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10502,11 +10603,6 @@ void sched_move_task(struct task_struct *tsk) } } -static inline struct task_group *css_tg(struct 
cgroup_subsys_state *css) -{ - return css ? container_of(css, struct task_group, css) : NULL; -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -10530,6 +10626,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -10544,6 +10645,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -10561,9 +10669,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; @@ -10571,7 +10680,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; +#endif + return scx_cgroup_can_attach(tset); } #endif @@ -10582,8 +10692,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + scx_cgroup_finish_attach(); } +#ifdef CONFIG_EXT_GROUP_SCHED +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); +} +#endif + #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css) { @@ -10762,9 +10881,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) static int cpu_shares_write_u64(struct 
cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, @@ -11161,7 +11286,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, } #endif -static struct cftype cpu_legacy_files[] = { +static struct cftype cpu_legacy_cftypes[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -11272,38 +11397,44 @@ static int cpu_local_stat_show(struct seq_file *sf, return 0; } +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + +static unsigned long tg_weight(struct task_group *tg) +{ #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif +} + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. 
- */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; @@ -11322,7 +11453,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -11331,7 +11462,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } #endif @@ -11392,21 +11527,23 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, } #endif -static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED - { +struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + [CPU_CFTYPE_WEIGHT] = { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_weight_read_u64, .write_u64 = cpu_weight_write_u64, }, - { + [CPU_CFTYPE_WEIGHT_NICE] = { .name = "weight.nice", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_weight_nice_read_s64, .write_s64 = 
cpu_weight_nice_write_s64, }, - { +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED + [CPU_CFTYPE_IDLE] = { .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_idle_read_s64, @@ -11414,13 +11551,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_CFS_BANDWIDTH - { + [CPU_CFTYPE_MAX] = { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_max_show, .write = cpu_max_write, }, - { + [CPU_CFTYPE_MAX_BURST] = { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_cfs_burst_read_u64, @@ -11428,13 +11565,13 @@ static struct cftype cpu_files[] = { }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP - { + [CPU_CFTYPE_UCLAMP_MIN] = { .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, }, - { + [CPU_CFTYPE_UCLAMP_MAX] = { .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, @@ -11447,16 +11584,20 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) .can_attach = cpu_cgroup_can_attach, #endif .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_legacy_files, - .dfl_cftypes = cpu_files, +#ifdef CONFIG_EXT_GROUP_SCHED + .cancel_attach = cpu_cgroup_cancel_attach, +#endif + .legacy_cftypes = cpu_legacy_cftypes, + .dfl_cftypes = cpu_cftypes, .early_init = true, .threaded = true, }; @@ -12044,3 +12185,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif + +#ifdef CONFIG_SCHED_CLASS_EXT +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(p); + + 
lockdep_assert_rq_held(rq); + + *ctx = (struct sched_enq_and_set_ctx){ + .p = p, + .queue_flags = queue_flags, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + update_rq_clock(rq); + if (ctx->queued) + dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + if (ctx->running) + put_prev_task(rq, p); +} + +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(ctx->p); + + lockdep_assert_rq_held(rq); + + if (ctx->queued) + enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + if (ctx->running) + set_next_task(rq, ctx->p); +} +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af8cc4..010d1dc5f9182 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2607,7 +2607,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, } /* Assumes rq->lock is held */ -static void rq_online_dl(struct rq *rq) +static void rq_online_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_set_overload(rq); @@ -2618,7 +2618,7 @@ static void rq_online_dl(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_dl(struct rq *rq) +static void rq_offline_dl(struct rq *rq, enum rq_onoff_reason reason) { if (rq->dl.overloaded) dl_clear_overload(rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8d5d98a5834df..6f306e1c9c3e0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1089,6 +1089,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(dl.runtime); P(dl.deadline); } +#ifdef CONFIG_SCHED_CLASS_EXT + __PS("ext.enabled", task_on_scx(p)); +#endif #undef PN_SCHEDSTAT #undef P_SCHEDSTAT diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 index 0000000000000..2d33efd9c3092 --- /dev/null +++ b/kernel/sched/ext.c @@ -0,0 +1,5071 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: 
Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ + +#include + +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_internal_consts { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, +}; + +enum scx_ops_enable_state { + SCX_OPS_PREPPING, + SCX_OPS_ENABLING, + SCX_OPS_ENABLED, + SCX_OPS_DISABLING, + SCX_OPS_DISABLED, +}; + +static const char *scx_ops_enable_state_str[] = { + [SCX_OPS_PREPPING] = "prepping", + [SCX_OPS_ENABLING] = "enabling", + [SCX_OPS_ENABLED] = "enabled", + [SCX_OPS_DISABLING] = "disabling", + [SCX_OPS_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. 
+ */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +/* + * During exit, a task may schedule after losing its PIDs. When disabling the + * BPF scheduler, we need to be able to iterate tasks in every state to + * guarantee system safety. Maintain a dedicated task list which contains every + * task between its fork and eventual free. 
+ */ +static DEFINE_SPINLOCK(scx_tasks_lock); +static LIST_HEAD(scx_tasks); + +/* ops enable/disable */ +static struct kthread_worker *scx_ops_helper; +static DEFINE_MUTEX(scx_ops_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); +static bool scx_switch_all_req; +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + +static struct sched_ext_ops scx_ops; +static bool scx_warned_zero_slice; + +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +struct static_key_false scx_has_op[SCX_OPI_END] = + { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; + +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); +static struct scx_exit_info scx_exit_info; + +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); + +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. 
+ */ +unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + +/* idle tracking */ +#ifdef CONFIG_SMP +#ifdef CONFIG_CPUMASK_OFFSTACK +#define CL_ALIGNED_IF_ONSTACK +#else +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp +#endif + +static struct { + cpumask_var_t cpu; + cpumask_var_t smt; +} idle_masks CL_ALIGNED_IF_ONSTACK; + +#endif /* CONFIG_SMP */ + +/* for %SCX_KICK_WAIT */ +static unsigned long __percpu *scx_kick_cpus_pnt_seqs; + +/* + * Direct dispatch marker. + * + * Non-NULL values are used for direct dispatch from enqueue path. A valid + * pointer points to the task currently being enqueued. An ERR_PTR value is used + * to indicate that direct dispatch has already happened. + */ +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); + +/* dispatch queues */ +static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; + +static const struct rhashtable_params dsq_hash_params = { + .key_len = 8, + .key_offset = offsetof(struct scx_dispatch_q, id), + .head_offset = offsetof(struct scx_dispatch_q, hash_node), +}; + +static struct rhashtable dsq_hash; +static LLIST_HEAD(dsqs_to_free); + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + unsigned long qseq; + u64 dsq_id; + u64 enq_flags; +}; + +static u32 scx_dsp_max_batch; +static struct scx_dsp_buf_ent __percpu *scx_dsp_buf; + +struct scx_dsp_ctx { + struct rq *rq; + struct rq_flags *rf; + u32 buf_cursor; + u32 nr_tasks; +}; + +static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); + +/* /sys/kernel/sched_ext interface */ +static struct kset *scx_kset; +static struct kobject *scx_root_kobj; + +static void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags); +static void scx_bpf_kick_cpu(s32 cpu, u64 flags); + +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; +}; + +#define SCX_HAS_OP(op) 
static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) + +/* + * If the highest set bit of @flags is N, return a mask with bits [N+1, 31] set. + * @flags == 0 yields all bits set and bit 31 being set yields 0. The shift is + * done in 64 bits because fls() can return 32 and shifting a 32bit value by + * its full width is undefined. + */ +static u32 higher_bits(u32 flags) +{ + return ~(u32)((1ULL << fls(flags)) - 1); +} + +/* + * Return the mask with only the highest set bit of @flags, or 0 if @flags is + * 0. An unsigned 1 is shifted so that bit 31 doesn't overflow a signed int. + */ +static u32 highest_bit(u32 flags) +{ + int bit = fls(flags); + return bit ? 1U << (bit - 1) : 0; +} + +/* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check + * whether it's running from an allowed context. + * + * @mask is constant, always inline to cull the mask calculations. + */ +static __always_inline void scx_kf_allow(u32 mask) +{ + /* nesting is allowed only in increasing scx_kf_mask order */ + WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, + "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", + current->scx.kf_mask, mask); + current->scx.kf_mask |= mask; +} + +static void scx_kf_disallow(u32 mask) +{ + current->scx.kf_mask &= ~mask; +} + +#define SCX_CALL_OP(mask, op, args...) \ +do { \ + if (mask) { \ + scx_kf_allow(mask); \ + scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + scx_ops.op(args); \ + } \ +} while (0) + +#define SCX_CALL_OP_RET(mask, op, args...) \ +({ \ + __typeof__(scx_ops.op(args)) __ret; \ + if (mask) { \ + scx_kf_allow(mask); \ + __ret = scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + __ret = scx_ops.op(args); \ + } \ + __ret; \ +}) + +/* + * Some kfuncs are allowed only on the tasks that are subjects of the + * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such + * restrictions, the following SCX_CALL_OP_*() variants should be used when + * invoking scx_ops operations that take task arguments. These can only be used + * for non-nesting operations due to the way the tasks are tracked. 
+ * + * kfuncs which can only operate on such tasks can in turn use + * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on + * the specific task. + */ +#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +do { \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + SCX_CALL_OP(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ +} while (0) + +#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +({ \ + __typeof__(scx_ops.op(task, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + __ret; \ +}) + +#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +({ \ + __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task0; \ + current->scx.kf_tasks[1] = task1; \ + __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + current->scx.kf_tasks[1] = NULL; \ + __ret; \ +}) + +/* @mask is constant, always inline to cull unnecessary branches */ +static __always_inline bool scx_kf_allowed(u32 mask) +{ + if (unlikely(!(current->scx.kf_mask & mask))) { + scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); + return false; + } + + if (unlikely((mask & (SCX_KF_INIT | SCX_KF_SLEEPABLE)) && + in_interrupt())) { + scx_ops_error("sleepable kfunc called from non-sleepable context"); + return false; + } + + /* + * Enforce nesting boundaries. e.g. A kfunc which can be called from + * DISPATCH must not be called if we're running DEQUEUE which is nested + * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE + * boundary thanks to the above in_interrupt() check. 
+ */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_ops_error("cpu_release kfunc called from a nested operation"); + return false; + } + + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && + (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { + scx_ops_error("dispatch kfunc called from a nested operation"); + return false; + } + + return true; +} + +/* + * Test whether a kfunc restricted to @mask may operate on @p. On top of the + * scx_kf_allowed() check on @mask, @p must be one of the tasks recorded in + * current->scx.kf_tasks[]. Triggers scx_ops_error() and returns %false + * otherwise. See SCX_CALL_OP_TASK(). + */ +static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, + struct task_struct *p) +{ + /* honor the caller's @mask instead of assuming a fixed one */ + if (!scx_kf_allowed(mask)) + return false; + + if (unlikely((p != current->scx.kf_tasks[0] && + p != current->scx.kf_tasks[1]))) { + scx_ops_error("called on a task not being operated on"); + return false; + } + + return true; +} + +/** + * scx_task_iter_init - Initialize a task iterator + * @iter: iterator to init + * + * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, + * @iter must eventually be exited with scx_task_iter_exit(). + * + * scx_tasks_lock may be released between this and the first next() call or + * between any two next() calls. If scx_tasks_lock is released between two + * next() calls, the caller is responsible for ensuring that the task being + * iterated remains accessible either through RCU read lock or obtaining a + * reference count. + * + * All tasks which existed when the iteration started are guaranteed to be + * visited as long as they still exist. + */ +static void scx_task_iter_init(struct scx_task_iter *iter) +{ + lockdep_assert_held(&scx_tasks_lock); + + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; +} + +/** + * scx_task_iter_exit - Exit a task iterator + * @iter: iterator to exit + * + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. + * If the iterator holds a task's rq lock, that rq lock is released. 
See + * scx_task_iter_init() for details. + */ +static void scx_task_iter_exit(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + + lockdep_assert_held(&scx_tasks_lock); + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + if (list_empty(cursor)) + return; + + list_del_init(cursor); +} + +/** + * scx_task_iter_next - Next task + * @iter: iterator to walk + * + * Visit the next task. See scx_task_iter_init() for details. + */ +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + struct sched_ext_entity *pos; + + lockdep_assert_held(&scx_tasks_lock); + + list_for_each_entry(pos, cursor, tasks_node) { + if (&pos->tasks_node == &scx_tasks) + return NULL; + if (!(pos->flags & SCX_TASK_CURSOR)) { + list_move(cursor, &pos->tasks_node); + return container_of(pos, struct task_struct, scx); + } + } + + /* can't happen, should always terminate at scx_tasks above */ + BUG(); +} + +/** + * scx_task_iter_next_filtered - Next non-idle task + * @iter: iterator to walk + * + * Visit the next non-idle task. See scx_task_iter_init() for details. + */ +static struct task_struct * +scx_task_iter_next_filtered(struct scx_task_iter *iter) +{ + struct task_struct *p; + + while ((p = scx_task_iter_next(iter))) { + /* + * is_idle_task() tests %PF_IDLE which may not be set for CPUs + * which haven't yet been onlined. Test sched_class directly. + */ + if (p->sched_class != &idle_sched_class) + return p; + } + return NULL; +} + +/** + * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked + * @iter: iterator to walk + * + * Visit the next non-idle task with its rq lock held. See scx_task_iter_init() + * for details. 
+ */ +static struct task_struct * +scx_task_iter_next_filtered_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; + + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } + + p = scx_task_iter_next_filtered(iter); + if (!p) + return NULL; + + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + return p; +} + +/* Return the current ops enable state. */ +static enum scx_ops_enable_state scx_ops_enable_state(void) +{ + return atomic_read(&scx_ops_enable_state_var); +} + +/* Unconditionally switch the enable state to @to and return the state it replaced. */ +static enum scx_ops_enable_state +scx_ops_set_enable_state(enum scx_ops_enable_state to) +{ + return atomic_xchg(&scx_ops_enable_state_var, to); +} + +/* + * Switch the enable state to @to only if it currently is @from. Returns %true + * if the switch was made. + */ +static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, + enum scx_ops_enable_state from) +{ + int from_v = from; + + return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); +} + +/* Test whether scx_ops_bypass_depth is non-zero. */ +static bool scx_ops_bypassing(void) +{ + return unlikely(atomic_read(&scx_ops_bypass_depth)); +} + +/** + * wait_ops_state - Busy-wait the specified ops state to end + * @p: target task + * @opss: state to wait the end of + * + * Busy-wait for @p to transition out of @opss. This can only be used when the + * state part of @opss is %SCX_OPSS_QUEUEING or %SCX_OPSS_DISPATCHING. This + * function also has load_acquire semantics to ensure that the caller can see + * the updates made in the enqueueing and dispatching paths. + */ +static void wait_ops_state(struct task_struct *p, unsigned long opss) +{ + do { + cpu_relax(); + } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); +} + +/** + * ops_cpu_valid - Verify a cpu number + * @cpu: cpu number which came from a BPF ops + * + * @cpu is a cpu number which came from the BPF scheduler and can be any value. + * Verify that it is in range and one of the possible cpus. 
+ */ +static bool ops_cpu_valid(s32 cpu) +{ + return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); +} + +/** + * ops_sanitize_err - Sanitize a -errno value + * @ops_name: operation to blame on failure + * @err: -errno value to sanitize + * + * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * -%EPROTO. This is necessary because returning a rogue -errno up the chain + * can cause misbehaviors. For an example, a large negative return from + * ops.init_task() triggers an oops when passed up the call chain because the + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is + * handled as a pointer. + */ +static int ops_sanitize_err(const char *ops_name, s32 err) +{ + if (err < 0 && err >= -MAX_ERRNO) + return err; + + scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + return -EPROTO; +} + +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = rq_clock_task(rq); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. 
This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. + */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) + touch_core_sched(rq, p); +#endif +} + +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + u64 now = rq_clock_task(rq); + u64 delta_exec; + + if (time_before_eq64(now, curr->se.exec_start)) + return; + + delta_exec = now - curr->se.exec_start; + curr->se.exec_start = now; + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); + + if (curr->scx.slice != SCX_SLICE_INF) { + curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } +} + +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_node.priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_node.priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + +static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + bool is_local = dsq->id == SCX_DSQ_LOCAL; + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); + WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); + + if (!is_local) { + raw_spin_lock(&dsq->lock); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { + scx_ops_error("attempting to dispatch to a destroyed dsq"); + /* fall back to the global dsq */ + raw_spin_unlock(&dsq->lock); + dsq = &scx_dsq_global; + raw_spin_lock(&dsq->lock); + } + } + + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * 
SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_ops_error("Cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, + scx_dsq_priq_less); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(!list_empty(&dsq->fifo))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + } else { + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_node.fifo, &dsq->fifo); + else + list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); + /* A DSQ should only be using either FIFO or PRIQ enqueuing. */ + if (unlikely(rb_first_cached(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + } + dsq->nr++; + p->scx.dsq = dsq; + + /* + * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the + * direct dispatch path, but we clear them here because the direct + * dispatch verdict may be overridden on the enqueue path during e.g. + * bypass. + */ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; + + /* + * We're transitioning out of QUEUEING or DISPATCHING. store_release to + * match waiters' load_acquire. 
+ */ + if (enq_flags & SCX_ENQ_CLEAR_OPSS) + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + if (is_local) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) + resched_curr(rq); + } else { + raw_spin_unlock(&dsq->lock); + } +} + +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } else { + list_del_init(&p->scx.dsq_node.fifo); + } +} + +static bool task_linked_on_dsq(struct task_struct *p) +{ + return !list_empty(&p->scx.dsq_node.fifo) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq); +} + +static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) +{ + struct scx_dispatch_q *dsq = p->scx.dsq; + bool is_local = dsq == &scx_rq->local_dsq; + + if (!dsq) { + WARN_ON_ONCE(task_linked_on_dsq(p)); + /* + * When dispatching directly from the BPF scheduler to a local + * DSQ, the task isn't associated with any DSQ but + * @p->scx.holding_cpu may be set under the protection of + * %SCX_OPSS_DISPATCHING. + */ + if (p->scx.holding_cpu >= 0) + p->scx.holding_cpu = -1; + return; + } + + if (!is_local) + raw_spin_lock(&dsq->lock); + + /* + * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node + * can't change underneath us. + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ + WARN_ON_ONCE(!task_linked_on_dsq(p)); + task_unlink_from_dsq(p, dsq); + dsq->nr--; + } else { + /* + * We're racing against dispatch_to_local_dsq() which already + * removed @p from @dsq and set @p->scx.holding_cpu. 
Clear the + * holding_cpu which tells dispatch_to_local_dsq() that it lost + * the race. + */ + WARN_ON_ONCE(task_linked_on_dsq(p)); + p->scx.holding_cpu = -1; + } + p->scx.dsq = NULL; + + if (!is_local) + raw_spin_unlock(&dsq->lock); +} + +static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) +{ + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_GLOBAL) + return &scx_dsq_global; + else + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, + dsq_hash_params); +} + +static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, + struct task_struct *p) +{ + struct scx_dispatch_q *dsq; + + if (dsq_id == SCX_DSQ_LOCAL) + return &rq->scx.local_dsq; + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); + return &scx_dsq_global; + } + + return dsq; +} + +static void mark_direct_dispatch(struct task_struct *ddsp_task, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + /* + * Mark that dispatch already happened from ops.select_cpu() or + * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value + * which can never match a valid task pointer. + */ + __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); + + /* @p must match the task on the enqueue path */ + if (unlikely(p != ddsp_task)) { + if (IS_ERR(ddsp_task)) + scx_ops_error("%s[%d] already direct-dispatched", + p->comm, p->pid); + else + scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); + return; + } + + /* + * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because + * dispatching to the local DSQ of a different CPU requires unlocking + * the current rq which isn't allowed in the enqueue path. Use + * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. 
+ */ + if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { + scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); + return; + } + + WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); + WARN_ON_ONCE(p->scx.ddsp_enq_flags); + + p->scx.ddsp_dsq_id = dsq_id; + p->scx.ddsp_enq_flags = enq_flags; +} + +static void direct_dispatch(struct task_struct *p, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + + touch_core_sched_dispatch(task_rq(p), p); + + enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags); +} + +static bool test_rq_online(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->online; +#else + return true; +#endif +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + int sticky_cpu) +{ + struct task_struct **ddsp_taskp; + unsigned long qseq; + + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + + /* rq migration */ + if (sticky_cpu == cpu_of(rq)) + goto local_norefill; + + /* + * If !rq->online, we already told the BPF scheduler that the CPU is + * offline. We're just trying to on/offline the CPU. Don't bother the + * BPF scheduler. 
+ */ + if (unlikely(!test_rq_online(rq))) + goto local; + + if (scx_ops_bypassing()) { + if (enq_flags & SCX_ENQ_LAST) + goto local; + else + goto global; + } + + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* see %SCX_OPS_ENQ_EXITING */ + if (!static_branch_unlikely(&scx_ops_enq_exiting) && + unlikely(p->flags & PF_EXITING)) + goto local; + + /* see %SCX_OPS_ENQ_LAST */ + if (!static_branch_unlikely(&scx_ops_enq_last) && + (enq_flags & SCX_ENQ_LAST)) + goto local; + + if (!SCX_HAS_OP(enqueue)) + goto global; + + /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ + qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; + + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + + *ddsp_taskp = NULL; + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* + * If not directly dispatched, QUEUEING isn't clear yet and dispatch or + * dequeue may be waiting. The store_release matches their load_acquire. + */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + return; + +direct: + direct_dispatch(p, enq_flags); + return; + +local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. 
+ */ + touch_core_sched(rq, p); + p->scx.slice = SCX_SLICE_DFL; +local_norefill: + dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); + return; + +global: + touch_core_sched(rq, p); /* see the comment in local: */ + p->scx.slice = SCX_SLICE_DFL; + dispatch_enqueue(&scx_dsq_global, p, enq_flags); +} + +static bool task_runnable(const struct task_struct *p) +{ + return !list_empty(&p->scx.runnable_node); +} + +static void set_task_runnable(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + } + + /* + * list_add_tail() must be used. scx_ops_bypass() depends on tasks being + * appened to the runnable_list. + */ + list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); +} + +static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) +{ + list_del_init(&p->scx.runnable_node); + if (reset_runnable_at) + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; +} + +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) +{ + int sticky_cpu = p->scx.sticky_cpu; + + enq_flags |= rq->scx.extra_enq_flags; + + if (sticky_cpu >= 0) + p->scx.sticky_cpu = -1; + + /* + * Restoring a running task will be immediately followed by + * set_next_task_scx() which expects the task to not be on the BPF + * scheduler as tasks can only start running through local DSQs. Force + * direct-dispatch into the local DSQ by setting the sticky_cpu. 
+ */ + if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) + sticky_cpu = cpu_of(rq); + + if (p->scx.flags & SCX_TASK_QUEUED) { + WARN_ON_ONCE(!task_runnable(p)); + return; + } + + set_task_runnable(rq, p); + p->scx.flags |= SCX_TASK_QUEUED; + rq->scx.nr_running++; + add_nr_running(rq, 1); + + if (SCX_HAS_OP(runnable)) + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + + if (enq_flags & SCX_ENQ_WAKEUP) + touch_core_sched(rq, p); + + do_enqueue_task(rq, p, enq_flags, sticky_cpu); +} + +static void ops_dequeue(struct task_struct *p, u64 deq_flags) +{ + unsigned long opss; + + /* dequeue is always temporary, don't reset runnable_at */ + clr_task_runnable(p, false); + + /* acquire ensures that we see the preceding updates on QUEUED */ + opss = atomic_long_read_acquire(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_NONE: + break; + case SCX_OPSS_QUEUEING: + /* + * QUEUEING is started and finished while holding @p's rq lock. + * As we're holding the rq lock now, we shouldn't see QUEUEING. + */ + BUG(); + case SCX_OPSS_QUEUED: + if (SCX_HAS_OP(dequeue)) + SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + + if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_NONE)) + break; + fallthrough; + case SCX_OPSS_DISPATCHING: + /* + * If @p is being dispatched from the BPF scheduler to a DSQ, + * wait for the transfer to complete so that @p doesn't get + * added to its DSQ after dequeueing is complete. + * + * As we're waiting on DISPATCHING with the rq locked, the + * dispatching side shouldn't try to lock the rq while + * DISPATCHING is set. See dispatch_to_local_dsq(). + * + * DISPATCHING shouldn't have qseq set and control can reach + * here with NONE @opss from the above QUEUED case block. + * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. 
+ */ + wait_ops_state(p, SCX_OPSS_DISPATCHING); + BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + break; + } +} + +static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +{ + struct scx_rq *scx_rq = &rq->scx; + + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(task_runnable(p)); + return; + } + + ops_dequeue(p, deq_flags); + + /* + * A currently running task which is going off @rq first gets dequeued + * and then stops running. As we want running <-> stopping transitions + * to be contained within runnable <-> quiescent transitions, trigger + * ->stopping() early here instead of in put_prev_task_scx(). + * + * @p may go through multiple stopping <-> running transitions between + * here and put_prev_task_scx() if task attribute changes occur while + * balance_scx() leaves @rq unlocked. However, they don't contain any + * information meaningful to the BPF scheduler and can be suppressed by + * skipping the callbacks if the task is !QUEUED. 
+ */ + if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + update_curr_scx(rq); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + } + + if (SCX_HAS_OP(quiescent)) + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) + p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; + else + p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; + + p->scx.flags &= ~SCX_TASK_QUEUED; + scx_rq->nr_running--; + sub_nr_running(rq, 1); + + dispatch_dequeue(scx_rq, p); +} + +static void yield_task_scx(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (SCX_HAS_OP(yield)) + SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + else + p->scx.slice = 0; +} + +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) +{ + struct task_struct *from = rq->curr; + + if (SCX_HAS_OP(yield)) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + else + return false; +} + +#ifdef CONFIG_SMP +/** + * move_task_to_local_dsq - Move a task from a different rq to a local DSQ + * @rq: rq to move the task into, currently locked + * @p: task to move + * @enq_flags: %SCX_ENQ_* + * + * Move @p which is currently on a different rq to @rq's local DSQ. The caller + * must: + * + * 1. Start with exclusive access to @p either through its DSQ lock or + * %SCX_OPSS_DISPATCHING flag. + * + * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). + * + * 3. Remember task_rq(@p). Release the exclusive access so that we don't + * deadlock with dequeue. + * + * 4. Lock @rq and the task_rq from #3. + * + * 5. Call this function. + * + * Returns %true if @p was successfully moved. %false after racing dequeue and + * losing. + */ +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *task_rq; + + lockdep_assert_rq_held(rq); + + /* + * If dequeue got to @p while we were trying to lock both rq's, it'd + * have cleared @p->scx.holding_cpu to -1. 
While other cpus may have + * updated it to different values afterwards, as this operation can't be + * preempted or recurse, @p->scx.holding_cpu can never become + * raw_smp_processor_id() again before we're done. Thus, we can tell + * whether we lost to dequeue by testing whether @p->scx.holding_cpu is + * still raw_smp_processor_id(). + * + * See dispatch_dequeue() for the counterpart. + */ + if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) + return false; + + /* @p->rq couldn't have changed if we're still the holding cpu */ + task_rq = task_rq(p); + lockdep_assert_rq_held(task_rq); + + WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); + deactivate_task(task_rq, p, 0); + set_task_cpu(p, cpu_of(rq)); + p->scx.sticky_cpu = cpu_of(rq); + + /* + * We want to pass scx-specific enq_flags but activate_task() will + * truncate the upper 32 bit. As we own @rq, we can pass them through + * @rq->scx.extra_enq_flags instead. + */ + WARN_ON_ONCE(rq->scx.extra_enq_flags); + rq->scx.extra_enq_flags = enq_flags; + activate_task(rq, p, 0); + rq->scx.extra_enq_flags = 0; + + return true; +} + +/** + * dispatch_to_local_dsq_lock - Ensure source and desitnation rq's are locked + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @src_rq: rq to move task from + * @dst_rq: rq to move task to + * + * We're holding @rq lock and trying to dispatch a task from @src_rq to + * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether + * @rq stays locked isn't important as long as the state is restored after + * dispatch_to_local_dsq_unlock(). 
+ */
+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf,
+				       struct rq *src_rq, struct rq *dst_rq)
+{
+	rq_unpin_lock(rq, rf);
+	/*
+	 * Four cases depending on how @rq relates to the pair. If @src_rq and
+	 * @dst_rq are the same rq, @rq is unlocked and that single rq locked.
+	 * If @rq is one of the pair, the other one is acquired with
+	 * double_lock_balance() and @rq is repinned. Otherwise, @rq is
+	 * unlocked and both rq's are locked with double_rq_lock().
+	 */
+	if (src_rq == dst_rq) {
+		raw_spin_rq_unlock(rq);
+		raw_spin_rq_lock(dst_rq);
+	} else if (rq == src_rq) {
+		double_lock_balance(rq, dst_rq);
+		rq_repin_lock(rq, rf);
+	} else if (rq == dst_rq) {
+		double_lock_balance(rq, src_rq);
+		rq_repin_lock(rq, rf);
+	} else {
+		raw_spin_rq_unlock(rq);
+		double_rq_lock(src_rq, dst_rq);
+	}
+}
+
+/**
+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
+ * @rq: current rq which is locked
+ * @rf: rq_flags to use when unlocking @rq
+ * @src_rq: rq to move task from
+ * @dst_rq: rq to move task to
+ *
+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
+ *
+ * The branches mirror the ones in dispatch_to_local_dsq_lock(). In the two
+ * branches where that function dropped @rq's lock, it is re-acquired and
+ * repinned with @rf here. In the other two, only the additional rq is
+ * released through double_unlock_balance().
+ */
+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf,
+					 struct rq *src_rq, struct rq *dst_rq)
+{
+	if (src_rq == dst_rq) {
+		raw_spin_rq_unlock(dst_rq);
+		raw_spin_rq_lock(rq);
+		rq_repin_lock(rq, rf);
+	} else if (rq == src_rq) {
+		double_unlock_balance(rq, dst_rq);
+	} else if (rq == dst_rq) {
+		double_unlock_balance(rq, src_rq);
+	} else {
+		double_rq_unlock(src_rq, dst_rq);
+		raw_spin_rq_lock(rq);
+		rq_repin_lock(rq, rf);
+	}
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Test whether @p may be pulled onto @rq: @rq must be online, @p must not have
+ * migration disabled and @rq's CPU must be in @p->cpus_ptr.
+ */
+static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq)
+{
+	return likely(test_rq_online(rq)) && !is_migration_disabled(p) &&
+		cpumask_test_cpu(cpu_of(rq), p->cpus_ptr);
+}
+/**
+ * consume_dispatch_q - Move one task from a DSQ to an rq's local DSQ
+ * @rq: rq to pull a task into
+ * @rf: rq_flags used when @rq's lock pinning has to be released
+ * @dsq: DSQ to consume from
+ *
+ * @dsq's FIFO list is scanned first and then its rb-tree in tree order. The
+ * first task which either is already on @rq or passes task_can_run_on_rq() is
+ * taken. A task on another rq is migrated with move_task_to_local_dsq(); if
+ * that loses the race against dequeue, the scan starts over.
+ *
+ * Returns %true if a task was moved to @rq's local DSQ, %false if @dsq is
+ * empty or holds no task eligible for @rq.
+ */
+static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
+			       struct scx_dispatch_q *dsq)
+{
+	struct scx_rq *scx_rq = &rq->scx;
+	struct task_struct *p;
+	struct rb_node *rb_node;
+	struct rq *task_rq;
+	bool moved = false;
+retry:
+	if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq))
+		return false;
+
+	raw_spin_lock(&dsq->lock);
+
+	list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) {
+		task_rq = task_rq(p);
+		if (rq == task_rq)
+			goto this_rq;
+		if (task_can_run_on_rq(p, rq))
+			goto remote_rq;
+	}
+
+	for (rb_node = rb_first_cached(&dsq->priq); rb_node;
+	     rb_node = rb_next(rb_node)) {
+		p = container_of(rb_node, struct task_struct, scx.dsq_node.priq);
+		task_rq = task_rq(p);
+		if (rq == task_rq)
+			goto this_rq;
+		if (task_can_run_on_rq(p, rq))
+			goto remote_rq;
+	}
+
+	raw_spin_unlock(&dsq->lock);
+	return false;
+
+this_rq:
+	/*
+	 * @dsq is locked and @p is on this rq. No migration is needed; @p is
+	 * relinked at the tail of the local DSQ's FIFO and both DSQs' counts
+	 * are adjusted by hand.
+	 */
+	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+	task_unlink_from_dsq(p, dsq);
+	list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo);
+	dsq->nr--;
+	scx_rq->local_dsq.nr++;
+	p->scx.dsq = &scx_rq->local_dsq;
+	raw_spin_unlock(&dsq->lock);
+	return true;
+
+remote_rq:
+#ifdef CONFIG_SMP
+	/*
+	 * @dsq is locked and @p is on a remote rq. @p is currently protected by
+	 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
+	 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
+	 * rq lock or fail, do a little dancing from our side. See
+	 * move_task_to_local_dsq().
+	 */
+	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+	task_unlink_from_dsq(p, dsq);
+	dsq->nr--;
+	p->scx.holding_cpu = raw_smp_processor_id();
+	raw_spin_unlock(&dsq->lock);
+
+	rq_unpin_lock(rq, rf);
+	double_lock_balance(rq, task_rq);
+	rq_repin_lock(rq, rf);
+
+	moved = move_task_to_local_dsq(rq, p, 0);
+
+	double_unlock_balance(rq, task_rq);
+#endif /* CONFIG_SMP */
+	if (likely(moved))
+		return true;
+	goto retry;
+}
+
+enum dispatch_to_local_dsq_ret {
+	DTL_DISPATCHED,		/* successfully dispatched */
+	DTL_LOST,		/* lost race to dequeue */
+	DTL_NOT_LOCAL,		/* destination is not a local DSQ */
+	DTL_INVALID,		/* invalid local dsq_id */
+};
+
+/**
+ * dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @rq: current rq which is locked
+ * @rf: rq_flags to use when unlocking @rq
+ * @dsq_id: destination dsq ID
+ * @p: task to dispatch
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by
+ * @dsq_id.
This function performs all the synchronization dancing needed + * because local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). + */ +static enum dispatch_to_local_dsq_ret +dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, + struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); + struct rq *dst_rq; + + /* + * We're synchronized against dequeue through DISPATCHING. As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. + */ + if (dsq_id == SCX_DSQ_LOCAL) { + dst_rq = rq; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (!ops_cpu_valid(cpu)) { + scx_ops_error("invalid cpu %d in SCX_DSQ_LOCAL_ON verdict for %s[%d]", + cpu, p->comm, p->pid); + return DTL_INVALID; + } + dst_rq = cpu_rq(cpu); + } else { + return DTL_NOT_LOCAL; + } + + /* if dispatching to @rq that @p is already on, no lock dancing needed */ + if (rq == src_rq && rq == dst_rq) { + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); + return DTL_DISPATCHED; + } + +#ifdef CONFIG_SMP + if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { + struct rq *locked_dst_rq = dst_rq; + bool dsp; + + /* + * @p is on a possibly remote @src_rq which we need to lock to + * move the task. If dequeue is in progress, it'd be locking + * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq + * lock while holding DISPATCHING. + * + * As DISPATCHING guarantees that @p is wholly ours, we can + * pretend that we're moving from a DSQ and use the same + * mechanism - mark the task under transfer with holding_cpu, + * release DISPATCHING and then follow the same protocol. 
+ */ + p->scx.holding_cpu = raw_smp_processor_id(); + + /* store_release ensures that dequeue sees the above */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); + + /* + * We don't require the BPF scheduler to avoid dispatching to + * offline CPUs mostly for convenience but also because CPUs can + * go offline between scx_bpf_dispatch() calls and here. If @p + * is destined to an offline CPU, queue it on its current CPU + * instead, which should always be safe. As this is an allowed + * behavior, don't trigger an ops error. + */ + if (unlikely(!test_rq_online(dst_rq))) + dst_rq = src_rq; + + if (src_rq == dst_rq) { + /* + * As @p is staying on the same rq, there's no need to + * go through the full deactivate/activate cycle. + * Optimize by abbreviating the operations in + * move_task_to_local_dsq(). + */ + dsp = p->scx.holding_cpu == raw_smp_processor_id(); + if (likely(dsp)) { + p->scx.holding_cpu = -1; + dispatch_enqueue(&dst_rq->scx.local_dsq, p, + enq_flags); + } + } else { + dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); + } + + /* if the destination CPU is idle, wake it up */ + if (dsp && p->sched_class > dst_rq->curr->sched_class) + resched_curr(dst_rq); + + dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); + + return dsp ? DTL_DISPATCHED : DTL_LOST; + } +#endif /* CONFIG_SMP */ + + scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", + cpu_of(dst_rq), p->comm, p->pid); + return DTL_INVALID; +} + +/** + * finish_dispatch - Asynchronously finish dispatching a task + * @rq: current rq which is locked + * @rf: rq_flags to use when unlocking @rq + * @p: task to finish dispatching + * @qseq_at_dispatch: qseq when @p started getting dispatched + * @dsq_id: destination DSQ ID + * @enq_flags: %SCX_ENQ_* + * + * Dispatching to local DSQs may need to wait for queueing to complete or + * require rq lock dancing. 
As we don't wanna do either while inside + * ops.dispatch() to avoid locking order inversion, we split dispatching into + * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the + * task and its qseq. Once ops.dispatch() returns, this function is called to + * finish up. + * + * There is no guarantee that @p is still valid for dispatching or even that it + * was valid in the first place. Make sure that the task is still owned by the + * BPF scheduler and claim the ownership before dispatching. + */ +static void finish_dispatch(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, + unsigned long qseq_at_dispatch, + u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + unsigned long opss; + + touch_core_sched_dispatch(rq, p); +retry: + /* + * No need for _acquire here. @p is accessed only after a successful + * try_cmpxchg to DISPATCHING. + */ + opss = atomic_long_read(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_DISPATCHING: + case SCX_OPSS_NONE: + /* someone else already got to it */ + return; + case SCX_OPSS_QUEUED: + /* + * If qseq doesn't match, @p has gone through at least one + * dispatch/dequeue and re-enqueue cycle between + * scx_bpf_dispatch() and here and we have no claim on it. + */ + if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) + return; + + /* + * While we know @p is accessible, we don't yet have a claim on + * it - the BPF scheduler is allowed to dispatch tasks + * spuriously and there can be a racing dequeue attempt. Let's + * claim @p by atomically transitioning it from QUEUED to + * DISPATCHING. + */ + if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_DISPATCHING))) + break; + goto retry; + case SCX_OPSS_QUEUEING: + /* + * do_enqueue_task() is in the process of transferring the task + * to the BPF scheduler while holding @p's rq lock. As we aren't + * holding any kernel or BPF resource that the enqueue path may + * depend upon, it's safe to wait. 
+ */ + wait_ops_state(p, opss); + goto retry; + } + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + + switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { + case DTL_DISPATCHED: + break; + case DTL_LOST: + break; + case DTL_INVALID: + dsq_id = SCX_DSQ_GLOBAL; + fallthrough; + case DTL_NOT_LOCAL: + dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), + dsq_id, p); + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + break; + } +} + +static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + u32 u; + + for (u = 0; u < dspc->buf_cursor; u++) { + struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u]; + + finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, + ent->enq_flags); + } + + dspc->nr_tasks += dspc->buf_cursor; + dspc->buf_cursor = 0; +} + +static int balance_one(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf, bool local) +{ + struct scx_rq *scx_rq = &rq->scx; + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + bool prev_on_scx = prev->sched_class == &ext_sched_class; + int nr_loops = SCX_DSP_MAX_LOOPS; + + lockdep_assert_rq_held(rq); + + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in scx_notify_pick_next_task(). + */ + if (SCX_HAS_OP(cpu_acquire)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), + NULL); + rq->scx.cpu_released = false; + } + + if (prev_on_scx) { + WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); + update_curr_scx(rq); + + /* + * If @prev is runnable & has slice left, it has priority and + * fetching more just increases latency for the fetched tasks. + * Tell put_prev_task_scx() to put @prev on local_dsq. 
If the + * BPF scheduler wants to handle this explicitly, it should + * implement ->cpu_released(). + * + * See scx_ops_disable_workfn() for the explanation on the + * disabling() test. + * + * When balancing a remote CPU for core-sched, there won't be a + * following put_prev_task_scx() call and we don't own + * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the + * same conditions later and pick @rq->curr accordingly. + */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && + prev->scx.slice && !scx_ops_bypassing()) { + if (local) + prev->scx.flags |= SCX_TASK_BAL_KEEP; + return 1; + } + } + + /* if there already are tasks to run, nothing to do */ + if (scx_rq->local_dsq.nr) + return 1; + + if (consume_dispatch_q(rq, rf, &scx_dsq_global)) + return 1; + + if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing()) + return 0; + + dspc->rq = rq; + dspc->rf = rf; + + /* + * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, + * the local DSQ might still end up empty after a successful + * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() + * produced some tasks, retry. The BPF scheduler may depend on this + * looping behavior to simplify its implementation. + */ + do { + dspc->nr_tasks = 0; + + SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), + prev_on_scx ? prev : NULL); + + flush_dispatch_buf(rq, rf); + + if (scx_rq->local_dsq.nr) + return 1; + if (consume_dispatch_q(rq, rf, &scx_dsq_global)) + return 1; + + /* + * ops.dispatch() can trap us in this loop by repeatedly + * dispatching ineligible tasks. Break out once in a while to + * allow the watchdog to run. As IRQ can't be enabled in + * balance(), we want to complete this scheduling cycle and then + * start a new one. IOW, we want to call resched_curr() on the + * next, most likely idle, task, not the current one. Use + * scx_bpf_kick_cpu() for deferred kicking. 
+ */ + if (unlikely(!--nr_loops)) { + scx_bpf_kick_cpu(cpu_of(rq), 0); + break; + } + } while (dspc->nr_tasks); + + return 0; +} + +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + int ret; + + ret = balance_one(rq, prev, rf, true); + +#ifdef CONFIG_SCHED_SMT + /* + * When core-sched is enabled, this ops.balance() call will be followed + * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() + * on the SMT siblings. Balance the siblings too. + */ + if (sched_core_enabled(rq)) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + int scpu; + + for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { + struct rq *srq = cpu_rq(scpu); + struct rq_flags srf; + struct task_struct *sprev = srq->curr; + + /* + * While core-scheduling, rq lock is shared among + * siblings but the debug annotations and rq clock + * aren't. Do pinning dance to transfer the ownership. + */ + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + rq_unpin_lock(rq, rf); + rq_pin_lock(srq, &srf); + + update_rq_clock(srq); + balance_one(srq, sprev, &srf, false); + + rq_unpin_lock(srq, &srf); + rq_repin_lock(rq, rf); + } + } +#endif + return ret; +} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ + if (p->scx.flags & SCX_TASK_QUEUED) { + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + dispatch_dequeue(&rq->scx, p); + } + + p->se.exec_start = rq_clock_task(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + + clr_task_runnable(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). 
+ */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + } +} + +static void put_prev_task_scx(struct rq *rq, struct task_struct *p) +{ +#ifndef CONFIG_SMP + /* + * UP workaround. + * + * Because SCX may transfer tasks across CPUs during dispatch, dispatch + * is performed from its balance operation which isn't called in UP. + * Let's work around by calling it from the operations which come right + * after. + * + * 1. If the prev task is on SCX, pick_next_task() calls + * .put_prev_task() right after. As .put_prev_task() is also called + * from other places, we need to distinguish the calls which can be + * done by looking at the previous task's state - if still queued or + * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). + * This case is handled here. + * + * 2. If the prev task is not on SCX, the first following call into SCX + * will be .pick_next_task(), which is covered by calling + * balance_scx() from pick_next_task_scx(). + * + * Note that we can't merge the first case into the second as + * balance_scx() must be called before the previous SCX task goes + * through put_prev_task_scx(). + * + * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. + * Pass in %NULL. + */ + if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) + balance_scx(rq, p, NULL); +#endif + + update_curr_scx(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + + /* + * If we're being called from put_prev_task_balance(), balance_scx() may + * have decided that @p should keep running. 
+ */ + if (p->scx.flags & SCX_TASK_BAL_KEEP) { + p->scx.flags &= ~SCX_TASK_BAL_KEEP; + set_task_runnable(rq, p); + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } + + if (p->scx.flags & SCX_TASK_QUEUED) { + set_task_runnable(rq, p); + + /* + * If @p has slice left and balance_scx() didn't tag it for + * keeping, @p is getting preempted by a higher priority + * scheduler class or core-sched forcing a different task. Leave + * it at the head of the local DSQ. + */ + if (p->scx.slice && !scx_ops_bypassing()) { + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + return; + } + + /* + * If we're in the pick_next_task path, balance_scx() should + * have already populated the local DSQ if there are any other + * available tasks. If empty, tell ops.enqueue() that @p is the + * only one available for this cpu. ops.enqueue() should put it + * on the local DSQ so that the subsequent pick_next_task_scx() + * can find the task unless it wants to trigger a separate + * follow-up scheduling event. 
+ */ + if (list_empty(&rq->scx.local_dsq.fifo)) + do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); + else + do_enqueue_task(rq, p, 0, -1); + } +} + +static struct task_struct *first_local_task(struct rq *rq) +{ + WARN_ON_ONCE(rb_first_cached(&rq->scx.local_dsq.priq)); + return list_first_entry_or_null(&rq->scx.local_dsq.fifo, + struct task_struct, scx.dsq_node.fifo); +} + +static struct task_struct *pick_next_task_scx(struct rq *rq) +{ + struct task_struct *p; + +#ifndef CONFIG_SMP + /* UP workaround - see the comment at the head of put_prev_task_scx() */ + if (unlikely(rq->curr->sched_class != &ext_sched_class)) + balance_scx(rq, rq->curr, NULL); +#endif + + p = first_local_task(rq); + if (!p) + return NULL; + + if (unlikely(!p->scx.slice)) { + if (!scx_ops_bypassing() && !scx_warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", + p->comm, p->pid); + scx_warned_zero_slice = true; + } + p->scx.slice = SCX_SLICE_DFL; + } + + set_next_task_scx(rq, p, true); + + return p; +} + +#ifdef CONFIG_SCHED_CORE +/** + * scx_prio_less - Task ordering for core-sched + * @a: task A + * @b: task B + * + * Core-sched is implemented as an additional scheduling layer on top of the + * usual sched_class'es and needs to find out the expected task ordering. For + * SCX, core-sched calls this function to interrogate the task ordering. + * + * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used + * to implement the default task ordering. The older the timestamp, the higher + * priority the task - the global FIFO ordering matching the default scheduling + * behavior. + * + * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to + * implement FIFO ordering within each local DSQ. See pick_task_scx(). 
+ */ +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + /* + * The const qualifiers are dropped from task_struct pointers when + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. + */ + if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + (struct task_struct *)a, + (struct task_struct *)b); + else + return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); +} + +/** + * pick_task_scx - Pick a candidate task for core-sched + * @rq: rq to pick the candidate task from + * + * Core-sched calls this function on each SMT sibling to determine the next + * tasks to run on the SMT siblings. balance_one() has been called on all + * siblings and put_prev_task_scx() has been called only for the current CPU. + * + * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look + * at the first task in the local dsq. @rq->curr has to be considered explicitly + * to mimic %SCX_TASK_BAL_KEEP. + */ +static struct task_struct *pick_task_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct task_struct *first = first_local_task(rq); + + if (curr->scx.flags & SCX_TASK_QUEUED) { + /* is curr the only runnable task? */ + if (!first) + return curr; + + /* + * Does curr trump first? We can always go by core_sched_at for + * this comparison as it represents global FIFO ordering when + * the default core-sched ordering is used and local-DSQ FIFO + * ordering otherwise. + * + * We can have a task with an earlier timestamp on the DSQ. For + * example, when a current task is preempted by a sibling + * picking a different cookie, the task would be requeued at the + * head of the local DSQ with an earlier timestamp than the + * core-sched picked next task. Besides, the BPF scheduler may + * dispatch any tasks to the local DSQ anytime. 
+ */ + if (curr->scx.slice && time_before64(curr->scx.core_sched_at, + first->scx.core_sched_at)) + return curr; + } + + return first; /* this may be %NULL */ +} +#endif /* CONFIG_SCHED_CORE */ + +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ +#ifdef CONFIG_SMP + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_STOP; +#endif + if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_RT; + return SCX_CPU_PREEMPT_UNKNOWN; +} + +void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task, + const struct sched_class *active) +{ + lockdep_assert_rq_held(rq); + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the + * callback if the CPU is staying on SCX, or going idle (in which + * case the SCX scheduler has actively decided not to schedule any + * tasks on the CPU). + */ + if (likely(active >= &ext_sched_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted, and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked. + */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(active), + .task = task, + }; + + SCX_CALL_OP(SCX_KF_CPU_RELEASE, + cpu_release, cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } +} + +#ifdef CONFIG_SMP + +static bool test_and_clear_cpu_idle(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. 
This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from idle_masks.smt. Ensure that @cpu + * is eventually cleared. + */ + if (cpumask_intersects(smt, idle_masks.smt)) + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + else if (cpumask_test_cpu(cpu, idle_masks.smt)) + __cpumask_clear_cpu(cpu, idle_masks.smt); + } +#endif + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); +} + +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (test_and_clear_cpu_idle(cpu)) + return cpu; + else + goto retry; +} + +static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *found) +{ + s32 cpu; + + *found = false; + + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return prev_cpu; + } + + /* + * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the + * local DSQ of the waker. 
+ */ + if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && + !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING)) { + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; + } + + if (p->nr_cpus_allowed == 1) { + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } else { + return prev_cpu; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && + test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto cpu_found; + + return prev_cpu; + +cpu_found: + *found = true; + return cpu; +} + +__bpf_kfunc_start_defs(); + +__bpf_kfunc +static s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *found) +{ + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { + *found = false; + return prev_cpu; + } + + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, found); +} + +__bpf_kfunc_end_defs(); + +static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) +{ + if (SCX_HAS_OP(select_cpu)) { + s32 cpu; + struct task_struct **ddsp_taskp; + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, p, prev_cpu, wake_flags); + *ddsp_taskp = NULL; + if (ops_cpu_valid(cpu)) { + return cpu; + } else { + scx_ops_error("select_cpu returned invalid cpu %d", cpu); + return prev_cpu; + } + } else { + bool found; + s32 cpu; + + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); + if (found) { + p->scx.slice = SCX_SLICE_DFL; + 
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + } + return cpu; + } +} + +static void set_cpus_allowed_scx(struct task_struct *p, + struct affinity_context *ac) +{ + set_cpus_allowed_common(p, ac); + + /* + * The effective cpumask is stored in @p->cpus_ptr which may temporarily + * differ from the configured one in @p->cpus_mask. Always tell the bpf + * scheduler the effective one. + * + * Fine-grained memory write control is enforced by BPF making the const + * designation pointless. Cast it away when calling the operation. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void reset_idle_masks(void) +{ + /* consider all cpus idle, should converge to the actual state quickly */ + cpumask_setall(idle_masks.cpu); + cpumask_setall(idle_masks.smt); +} + +void __scx_update_idle(struct rq *rq, bool idle) +{ + int cpu = cpu_of(rq); + + if (SCX_HAS_OP(update_idle)) { + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + if (!static_branch_unlikely(&scx_builtin_idle_enabled)) + return; + } + + if (idle) + cpumask_set_cpu(cpu, idle_masks.cpu); + else + cpumask_clear_cpu(cpu, idle_masks.cpu); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + if (idle) { + /* + * idle_masks.smt handling is racy but that's fine as + * it's only for optimization and self-correcting. 
+ */ + for_each_cpu(cpu, smt) { + if (!cpumask_test_cpu(cpu, idle_masks.cpu)) + return; + } + cpumask_or(idle_masks.smt, idle_masks.smt, smt); + } else { + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + } + } +#endif +} + +static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); +} + +static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) +{ + if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) + SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); +} + +#else /* !CONFIG_SMP */ + +static bool test_and_clear_cpu_idle(int cpu) { return false; } +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } +static void reset_idle_masks(void) {} + +#endif /* CONFIG_SMP */ + +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + rq_unlock_irqrestore(rq, &rf); + + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ + update_curr_scx(rq); + + /* + * While disabling, always 
resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). + */ + if (scx_ops_bypassing()) { + curr->scx.slice = 0; + touch_core_sched(rq, curr); + } + + if (!curr->scx.slice) + resched_curr(rq); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. + */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; +} + +static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +{ + enum scx_task_state prev_state = scx_get_task_state(p); + + BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + WARN_ON_ONCE(prev_state != SCX_TASK_NONE); + break; + case SCX_TASK_READY: + WARN_ON_ONCE(prev_state == SCX_TASK_NONE); + break; + case SCX_TASK_ENABLED: + WARN_ON_ONCE(prev_state != SCX_TASK_READY); + break; + default: + WARN_ON_ONCE(true); + return; + } + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state << SCX_TASK_STATE_SHIFT; +} + +static int scx_ops_init_task(struct task_struct *p, struct task_group *tg) +{ + int ret; + + p->scx.disallow = false; + + if (SCX_HAS_OP(init_task)) { + struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) + }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); + if (unlikely(ret)) { + ret = ops_sanitize_err("init_task", ret); + return ret; + } + } + + scx_set_task_state(p, 
SCX_TASK_INIT); + + if (p->scx.disallow) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're either in fork or load path and @p->policy will be + * applied right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.init_task() sets @p->disallow, @p can + * never be in SCX. + */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic_long_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } + + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + return 0; +} + +static void set_task_scx_weight(struct task_struct *p) +{ + u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); +} + +static void scx_ops_enable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* + * Set the weight before calling ops.enable() so that the scheduler + * doesn't see a stale value if they inspect the task struct. 
+ */ + set_task_scx_weight(p); + if (SCX_HAS_OP(enable)) + SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void scx_ops_disable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + + if (SCX_HAS_OP(disable)) + SCX_CALL_OP(SCX_KF_REST, disable, p); + scx_set_task_state(p, SCX_TASK_READY); +} + +static void scx_ops_exit_task(struct task_struct *p) +{ + struct scx_exit_task_args args = { + .cancelled = false, + }; + + lockdep_assert_rq_held(task_rq(p)); + switch (scx_get_task_state(p)) { + case SCX_TASK_NONE: + return; + case SCX_TASK_INIT: + args.cancelled = true; + break; + case SCX_TASK_READY: + break; + case SCX_TASK_ENABLED: + scx_ops_disable_task(p); + break; + default: + WARN_ON_ONCE(true); + return; + } + + if (SCX_HAS_OP(exit_task)) + SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + scx_set_task_state(p, SCX_TASK_NONE); +} + +void scx_pre_fork(struct task_struct *p) +{ + /* + * BPF scheduler enable/disable paths want to be able to iterate and + * update all tasks which can become complex when racing forks. As + * enable/disable are very cold paths, let's use a percpu_rwsem to + * exclude forks. + */ + percpu_down_read(&scx_fork_rwsem); +} + +int scx_fork(struct task_struct *p) +{ + percpu_rwsem_assert_held(&scx_fork_rwsem); + + if (scx_enabled()) + return scx_ops_init_task(p, task_group(p)); + else + return 0; +} + +void scx_post_fork(struct task_struct *p) +{ + if (scx_enabled()) { + scx_set_task_state(p, SCX_TASK_READY); + /* + * Enable the task immediately if it's running on sched_ext. + * Otherwise, it'll be enabled in switching_to_scx() if and + * when it's ever configured to run with a SCHED_EXT policy. 
+ */ + if (p->sched_class == &ext_sched_class) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_enable_task(p); + task_rq_unlock(rq, p, &rf); + } + } + + spin_lock_irq(&scx_tasks_lock); + list_add_tail(&p->scx.tasks_node, &scx_tasks); + spin_unlock_irq(&scx_tasks_lock); + + percpu_up_read(&scx_fork_rwsem); +} + +/* + * Undo the per-task SCX setup for a fork that is being abandoned and drop the + * scx_fork_rwsem read lock taken in scx_pre_fork(). + */ +void scx_cancel_fork(struct task_struct *p) +{ + if (scx_enabled()) { + struct rq_flags rf; + struct rq *rq; + + /* + * scx_ops_exit_task() asserts that @p's rq lock is held. Take + * it here the same way sched_ext_free() does. + */ + rq = task_rq_lock(p, &rf); + WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } + percpu_up_read(&scx_fork_rwsem); +} + +void sched_ext_free(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&scx_tasks_lock, flags); + list_del_init(&p->scx.tasks_node); + spin_unlock_irqrestore(&scx_tasks_lock, flags); + + /* + * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> + * ENABLED transitions can't race us. Disable ops for @p. + */ + if (scx_get_task_state(p) != SCX_TASK_NONE) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } +} + +static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) +{ + lockdep_assert_rq_held(task_rq(p)); + + set_task_scx_weight(p); + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +{ +} + +static void switching_to_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_enable_task(p); + + /* + * set_cpus_allowed_scx() is not called while @p is associated with a + * different scheduler class. Keep the BPF scheduler up-to-date. 
+ */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void switched_from_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_disable_task(p); +} + +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void switched_to_scx(struct rq *rq, struct task_struct *p) {} + +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_bypassing()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). 
+ */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, + tg->css.cgroup, &args); + if (!ret) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + else + ret = ops_sanitize_err("cgroup_init", ret); + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} + +void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_enabled()) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + + if (SCX_HAS_OP(cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move, + p, from, css->cgroup); + if (ret) + goto err; + } + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (!p->scx.cgrp_moving_from) + break; + if (SCX_HAS_OP(cgroup_cancel_move)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return 
ops_sanitize_err("cgroup_prep_move", ret); +} + +void scx_move_task(struct task_struct *p) +{ + /* + * We're called from sched_move_task() which handles both cgroup and + * autogroup moves. Ignore the latter. + * + * Also ignore exiting tasks, because in the exit path tasks transition + * from the autogroup to the root group, so task_group_is_autogroup() + * alone isn't able to catch exiting autogroup tasks. This is safe for + * cgroup_move(), because cgroup migrations never happen for PF_EXITING + * tasks. + */ + if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p))) + return; + + if (!scx_enabled()) + return; + + if (SCX_HAS_OP(cgroup_move)) { + if (WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + return; + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + } + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_enabled()) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move)) { + WARN_ON_ONCE(!p->scx.cgrp_moving_from); + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + } + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + percpu_down_read(&scx_cgroup_rwsem); + + if (tg->scx_weight != weight) { + if (SCX_HAS_OP(cgroup_set_weight)) + SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight, + tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void 
scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +/* + * Omitted operations: + * + * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task + * isn't tied to the CPU at that point. Preemption is implemented by resetting + * the victim task's slice to 0 and triggering reschedule on the target CPU. + * + * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. + * + * - task_fork/dead: We need fork/dead notifications for all tasks regardless of + * their current sched_class. Call them directly from sched core instead. + * + * - task_woken: Unnecessary. + */ +DEFINE_SCHED_CLASS(ext) = { + .enqueue_task = enqueue_task_scx, + .dequeue_task = dequeue_task_scx, + .yield_task = yield_task_scx, + .yield_to_task = yield_to_task_scx, + + .wakeup_preempt = wakeup_preempt_scx, + + .pick_next_task = pick_next_task_scx, + + .put_prev_task = put_prev_task_scx, + .set_next_task = set_next_task_scx, + +#ifdef CONFIG_SMP + .balance = balance_scx, + .select_task_rq = select_task_rq_scx, + .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, +#endif + +#ifdef CONFIG_SCHED_CORE + .pick_task = pick_task_scx, +#endif + + .task_tick = task_tick_scx, + + .switching_to = switching_to_scx, + .switched_from = switched_from_scx, + .switched_to = switched_to_scx, + .reweight_task = reweight_task_scx, + .prio_changed = prio_changed_scx, + + .update_curr = update_curr_scx, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 0, +#endif +}; + +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +{ + memset(dsq, 0, sizeof(*dsq)); + + raw_spin_lock_init(&dsq->lock); + INIT_LIST_HEAD(&dsq->fifo); + dsq->id = dsq_id; +} + +static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) +{ + struct scx_dispatch_q *dsq; + int ret; + + if (dsq_id & SCX_DSQ_FLAG_BUILTIN) + return ERR_PTR(-EINVAL); + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); 
+ if (!dsq) + return ERR_PTR(-ENOMEM); + + init_dsq(dsq, dsq_id); + + ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); + if (ret) { + kfree(dsq); + return ERR_PTR(ret); + } + return dsq; +} + +static void free_dsq_irq_workfn(struct irq_work *irq_work) +{ + struct llist_node *to_free = llist_del_all(&dsqs_to_free); + struct scx_dispatch_q *dsq, *tmp_dsq; + + llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) + kfree_rcu(dsq, rcu); +} + +static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); + +static void destroy_dsq(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + unsigned long flags; + + rcu_read_lock(); + + dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + if (!dsq) + goto out_unlock_rcu; + + raw_spin_lock_irqsave(&dsq->lock, flags); + + if (dsq->nr) { + scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); + goto out_unlock_dsq; + } + + if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + goto out_unlock_dsq; + + /* + * Mark dead by invalidating ->id to prevent dispatch_enqueue() from + * queueing more tasks. As this function can be called from anywhere, + * freeing is bounced through an irq work to avoid nesting RCU + * operations inside scheduler locks. + */ + dsq->id = SCX_DSQ_INVALID; + llist_add(&dsq->free_node, &dsqs_to_free); + irq_work_queue(&free_dsq_irq_work); + +out_unlock_dsq: + raw_spin_unlock_irqrestore(&dsq->lock, flags); +out_unlock_rcu: + rcu_read_unlock(); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. 
+ */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!scx_ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(void) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and init, all online cgroups are initialized. + */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!scx_ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, + css->cgroup, &args); + if (ret) { + css_put(css); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + return 0; +} + +static void scx_cgroup_config_knobs(void) +{ + static DEFINE_MUTEX(cgintf_mutex); + DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; + u64 knob_flags; + int i; + + /* + * Called from both class switch and ops enable/disable paths, + * synchronize internally. + */ + mutex_lock(&cgintf_mutex); + + /* if fair is in use, all knobs should be shown */ + if (!scx_switched_all()) { + bitmap_fill(mask, CPU_CFTYPE_CNT); + goto apply; + } + + /* + * On ext, only show the supported knobs. 
Otherwise, show all possible + * knobs so that configuration attempts succeed and the states are + * remembered while ops is not loaded. + */ + if (scx_enabled()) + knob_flags = scx_ops.flags; + else + knob_flags = SCX_OPS_ALL_FLAGS; + + if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { + __set_bit(CPU_CFTYPE_WEIGHT, mask); + __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); + } +apply: + for (i = 0; i < CPU_CFTYPE_CNT; i++) + cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); + + mutex_unlock(&cgintf_mutex); +} + +#else +static void scx_cgroup_exit(void) {} +static int scx_cgroup_init(void) { return 0; } +static void scx_cgroup_config_knobs(void) {} +#endif + + +/******************************************************************************** + * Sysfs interface and ops enable/disable. + */ + +#define SCX_ATTR(_name) \ + static struct kobj_attribute scx_attr_##_name = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = scx_attr_##_name##_show, \ + } + +static ssize_t scx_attr_state_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", + scx_ops_enable_state_str[scx_ops_enable_state()]); +} +SCX_ATTR(state); + +static ssize_t scx_attr_switch_all_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); +} +SCX_ATTR(switch_all); + +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + +static struct attribute *scx_global_attrs[] = { + &scx_attr_state.attr, + &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, + NULL, +}; + +static const struct attribute_group scx_global_attr_group = { + .attrs = scx_global_attrs, +}; + +static void scx_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static ssize_t scx_attr_ops_show(struct kobject *kobj, + struct kobj_attribute *ka, 
char *buf) +{ + return sysfs_emit(buf, "%s\n", scx_ops.name); +} +SCX_ATTR(ops); + +static struct attribute *scx_sched_attrs[] = { + &scx_attr_ops.attr, + NULL, +}; +ATTRIBUTE_GROUPS(scx_sched); + +static const struct kobj_type scx_ktype = { + .release = scx_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = scx_sched_groups, +}; + +static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); +} + +static const struct kset_uevent_ops scx_uevent_ops = { + .uevent = scx_uevent, +}; + +/* + * Used by sched_fork() and __setscheduler_prio() to pick the matching + * sched_class. dl/rt are already handled. + */ +bool task_should_scx(struct task_struct *p) +{ + if (!scx_enabled() || + unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) + return false; + if (READ_ONCE(scx_switching_all)) + return true; + return p->policy == SCHED_EXT; +} + +/** + * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * + * Bypassing guarantees that all runnable tasks make forward progress without + * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might + * be held by tasks that the BPF scheduler is forgetting to run, which + * unfortunately also excludes toggling the static branches. + * + * Let's work around by overriding a couple ops and modifying behaviors based on + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue + * to force global FIFO scheduling. + * + * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * + * b. ops.dispatch() is ignored. + * + * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be + * trusted. Whenever a tick triggers, the running task is rotated to the tail + * of the queue with core_sched_at touched. + * + * d. pick_next_task() suppresses zero slice warning. + * + * e. scx_prio_less() reverts to the default core_sched_at order. + * + * f. 
scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + */ +static void scx_ops_bypass(bool bypass) +{ + int depth, cpu; + + if (bypass) { + depth = atomic_inc_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth <= 0); + if (depth != 1) + return; + } else { + depth = atomic_dec_return(&scx_ops_bypass_depth); + WARN_ON_ONCE(depth < 0); + if (depth != 0) + return; + } + + /* + * No task property is changing. We just need to make sure all currently + * queued tasks are re-queued according to the new scx_ops_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * + * This function can't trust the scheduler and thus can't use + * cpus_read_lock(). Walk all possible CPUs instead of online. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p, *n; + + rq_lock_irqsave(rq, &rf); + + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back + * to the runnable_list during iteration. Because they're added + * to the tail of the list, safe reverse iteration can still + * visit all nodes. 
+ */ + list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, + scx.runnable_node) { + struct sched_enq_and_set_ctx ctx; + + /* cycling deq/enq is enough, see the function comment */ + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_enq_and_set_task(&ctx); + } + + rq_unlock_irqrestore(rq, &rf); + + /* kick to restore ticks */ + resched_cpu(cpu); + } +} + +static void scx_ops_disable_workfn(struct kthread_work *work) +{ + struct scx_exit_info *ei = &scx_exit_info; + struct scx_task_iter sti; + struct task_struct *p; + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + const char *reason; + int i, kind; + + kind = atomic_read(&scx_exit_kind); + while (true) { + /* + * NONE indicates that a new scx_ops has been registered since + * disable was scheduled - don't kill the new ops. DONE + * indicates that the ops has already been disabled. + */ + if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) + return; + if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) + break; + } + + cancel_delayed_work_sync(&scx_watchdog_work); + + switch (kind) { + case SCX_EXIT_UNREG: + reason = "BPF scheduler unregistered"; + break; + case SCX_EXIT_SYSRQ: + reason = "disabled by sysrq-S"; + break; + case SCX_EXIT_ERROR: + reason = "runtime error"; + break; + case SCX_EXIT_ERROR_BPF: + reason = "scx_bpf_error"; + break; + case SCX_EXIT_ERROR_STALL: + reason = "runnable task stall"; + break; + default: + reason = ""; + } + + ei->kind = kind; + strscpy_pad(ei->reason, reason, sizeof(ei->reason)); + + /* guarantee forward progress by bypassing scx_ops */ + scx_ops_bypass(true); + + switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { + case SCX_OPS_DISABLING: + WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); + break; + case SCX_OPS_DISABLED: + pr_warn("sched_ext: ops error detected without ops (%s)\n", + scx_exit_info.msg); + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + goto done; + 
default: + break; + } + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ + mutex_lock(&scx_ops_enable_mutex); + + static_branch_disable(&__scx_switched_all); + WRITE_ONCE(scx_switching_all, false); + + /* + * Avoid racing against fork and cgroup changes. See scx_ops_enable() + * for explanation on the locking order. + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); + scx_cgroup_lock(); + + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + bool alive = READ_ONCE(p->__state) != TASK_DEAD; + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); + + __setscheduler_prio(p, p->prio); + if (alive) + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + if (alive) + check_class_changed(task_rq(p), p, old_class, p->prio); + + scx_ops_exit_task(p); + } + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + + /* no task is on scx, turn off all the switches and flush in-progress calls */ + static_branch_disable_cpuslocked(&__scx_ops_enabled); + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + static_branch_disable_cpuslocked(&scx_has_op[i]); + static_branch_disable_cpuslocked(&scx_ops_enq_last); + static_branch_disable_cpuslocked(&scx_ops_enq_exiting); + static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + synchronize_rcu(); + + scx_cgroup_exit(); + + scx_cgroup_unlock(); + cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + + if (ei->kind >= SCX_EXIT_ERROR) { + printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); + + if (ei->msg[0] == '\0') + printk(KERN_ERR "sched_ext: 
%s\n", ei->reason); + else + printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); + + stack_trace_print(ei->bt, ei->bt_len, 2); + } + + if (scx_ops.exit) + SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + + kobject_del(scx_root_kobj); + scx_root_kobj = NULL; + + memset(&scx_ops, 0, sizeof(scx_ops)); + + rhashtable_walk_enter(&dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + free_percpu(scx_dsp_buf); + scx_dsp_buf = NULL; + scx_dsp_max_batch = 0; + + mutex_unlock(&scx_ops_enable_mutex); + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + + scx_cgroup_config_knobs(); +done: + scx_ops_bypass(false); +} + +static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); + +static void schedule_scx_ops_disable_work(void) +{ + struct kthread_worker *helper = READ_ONCE(scx_ops_helper); + + /* + * We may be called spuriously before the first bpf_sched_ext_reg(). If + * scx_ops_helper isn't set up yet, there's nothing to do. + */ + if (helper) + kthread_queue_work(helper, &scx_ops_disable_work); +} + +static void scx_ops_disable(enum scx_exit_kind kind) +{ + int none = SCX_EXIT_NONE; + + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; + + atomic_try_cmpxchg(&scx_exit_kind, &none, kind); + + schedule_scx_ops_disable_work(); +} + +static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +{ + schedule_scx_ops_disable_work(); +} + +static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); + +__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, + const char *fmt, ...) 
+{ + struct scx_exit_info *ei = &scx_exit_info; + int none = SCX_EXIT_NONE; + va_list args; + + if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) + return; + + ei->bt_len = stack_trace_save(ei->bt, ARRAY_SIZE(ei->bt), 1); + + va_start(args, fmt); + vscnprintf(ei->msg, ARRAY_SIZE(ei->msg), fmt, args); + va_end(args); + + irq_work_queue(&scx_ops_error_irq_work); +} + +static struct kthread_worker *scx_create_rt_helper(const char *name) +{ + struct kthread_worker *helper; + + helper = kthread_create_worker(0, name); + if (helper) + sched_set_fifo(helper->task); + return helper; +} + +static int validate_ops(const struct sched_ext_ops *ops) +{ + /* + * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the + * ops.enqueue() callback isn't implemented. + */ + if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { + scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + return -EINVAL; + } + + return 0; +} + +static int scx_ops_enable(struct sched_ext_ops *ops) +{ + struct scx_task_iter sti; + struct task_struct *p; + unsigned long timeout; + int i, ret; + + mutex_lock(&scx_ops_enable_mutex); + + if (!scx_ops_helper) { + WRITE_ONCE(scx_ops_helper, + scx_create_rt_helper("sched_ext_ops_helper")); + if (!scx_ops_helper) { + ret = -ENOMEM; + goto err; + } + } + + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + ret = -EBUSY; + goto err; + } + + scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); + if (!scx_root_kobj) { + ret = -ENOMEM; + goto err; + } + + scx_root_kobj->kset = scx_kset; + ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err; + + /* + * Set scx_ops, transition to PREPPING and clear exit info to arm the + * disable path. Failure triggers full disabling from here on. 
+ */ + scx_ops = *ops; + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != + SCX_OPS_DISABLED); + + memset(&scx_exit_info, 0, sizeof(scx_exit_info)); + atomic_set(&scx_exit_kind, SCX_EXIT_NONE); + scx_warned_zero_slice = false; + + atomic_long_set(&scx_nr_rejected, 0); + + /* + * Keep CPUs stable during enable so that the BPF scheduler can track + * online CPUs by watching ->on/offline_cpu() after ->init(). + */ + cpus_read_lock(); + + scx_switch_all_req = false; + if (scx_ops.init) { + ret = SCX_CALL_OP_RET(SCX_KF_INIT, init); + if (ret) { + ret = ops_sanitize_err("init", ret); + goto err_disable_unlock_cpus; + } + + /* + * Exit early if ops.init() triggered scx_bpf_error(). Not + * strictly necessary as we'll fail transitioning into ENABLING + * later but that'd be after calling ops.init_task() on all + * tasks and with -EBUSY which isn't very intuitive. Let's exit + * early with success so that the condition is notified through + * ops.exit() like other scx_bpf_error() invocations. 
+ */ + if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE) + goto err_disable_unlock_cpus; + } + + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + cpus_read_unlock(); + + ret = validate_ops(ops); + if (ret) + goto err_disable; + + WARN_ON_ONCE(scx_dsp_buf); + scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch, + __alignof__(scx_dsp_buf[0])); + if (!scx_dsp_buf) { + ret = -ENOMEM; + goto err_disable; + } + + if (ops->timeout_ms) + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + + /* + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + * + * We don't need to keep the CPUs stable but static_branch_*() requires + * cpus_read_lock() and scx_cgroup_rwsem must nest inside + * cpu_hotplug_lock because of the following dependency chain: + * + * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem + * + * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use + * static_branch_*_cpuslocked(). 
+ * + * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the + * following dependency chain: + * + * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock + */ + percpu_down_write(&scx_fork_rwsem); + cpus_read_lock(); + scx_cgroup_lock(); + + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + if (ops->flags & SCX_OPS_ENQ_LAST) + static_branch_enable_cpuslocked(&scx_ops_enq_last); + + if (ops->flags & SCX_OPS_ENQ_EXITING) + static_branch_enable_cpuslocked(&scx_ops_enq_exiting); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); + + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + reset_idle_masks(); + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + } else { + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + } + + /* + * All cgroups should be initialized before letting in tasks. cgroup + * on/offlining and task migrations are already locked out. + */ + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + + static_branch_enable_cpuslocked(&__scx_ops_enabled); + + /* + * Enable ops for every task. Fork is excluded by scx_fork_rwsem + * preventing new tasks from being added. No need to exclude tasks + * leaving as sched_ext_free() can handle both prepped and enabled + * tasks. Prep all tasks first and then enable them with preemption + * disabled. 
+ */ + spin_lock_irq(&scx_tasks_lock); + + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered(&sti))) { + get_task_struct(p); + spin_unlock_irq(&scx_tasks_lock); + + ret = scx_ops_init_task(p, task_group(p)); + if (ret) { + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + scx_task_iter_exit(&sti); + spin_unlock_irq(&scx_tasks_lock); + pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", + ret, p->comm, p->pid); + goto err_disable_unlock_all; + } + + put_task_struct(p); + spin_lock_irq(&scx_tasks_lock); + } + scx_task_iter_exit(&sti); + + /* + * All tasks are prepped but are still ops-disabled. Ensure that + * %current can't be scheduled out and switch everyone. + * preempt_disable() is necessary because we can't guarantee that + * %current won't be starved if scheduled out while switching. + */ + preempt_disable(); + + /* + * From here on, the disable path must assume that tasks have ops + * enabled and need to be recovered. + */ + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { + preempt_enable(); + spin_unlock_irq(&scx_tasks_lock); + ret = -EBUSY; + goto err_disable_unlock_all; + } + + /* + * We're fully committed and can't fail. The PREPPED -> ENABLED + * transitions here are synchronized against sched_ext_free() through + * scx_tasks_lock. 
+ */ + WRITE_ONCE(scx_switching_all, scx_switch_all_req); + + scx_task_iter_init(&sti); + while ((p = scx_task_iter_next_filtered_locked(&sti))) { + if (READ_ONCE(p->__state) != TASK_DEAD) { + const struct sched_class *old_class = p->sched_class; + struct sched_enq_and_set_ctx ctx; + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, + &ctx); + + scx_set_task_state(p, SCX_TASK_READY); + __setscheduler_prio(p, p->prio); + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + } else { + scx_ops_exit_task(p); + } + } + scx_task_iter_exit(&sti); + + spin_unlock_irq(&scx_tasks_lock); + preempt_enable(); + scx_cgroup_unlock(); + cpus_read_unlock(); + percpu_up_write(&scx_fork_rwsem); + + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { + ret = -EBUSY; + goto err_disable; + } + + if (scx_switch_all_req) + static_branch_enable(&__scx_switched_all); + + kobject_uevent(scx_root_kobj, KOBJ_ADD); + mutex_unlock(&scx_ops_enable_mutex); + + scx_cgroup_config_knobs(); + + return 0; + +err: + kfree(scx_root_kobj); + scx_root_kobj = NULL; + mutex_unlock(&scx_ops_enable_mutex); + return ret; + +err_disable_unlock_all: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable_unlock_cpus: + cpus_read_unlock(); +err_disable: + mutex_unlock(&scx_ops_enable_mutex); + /* must be fully disabled before returning */ + scx_ops_disable(SCX_EXIT_ERROR); + kthread_flush_work(&scx_ops_disable_work); + return ret; +} + + +/******************************************************************************** + * bpf_struct_ops plumbing. + */ +#include +#include + +extern struct btf *btf_vmlinux; +static const struct btf_type *task_struct_type; +static u32 task_struct_type_id; + +/* Make the 2nd argument of .dispatch a pointer that can be NULL. 
*/ +static bool promote_dispatch_2nd_arg(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const struct bpf_struct_ops *st_ops; + const struct btf_member *member; + const struct btf_type *t; + u32 btf_id, member_idx; + const char *mname; + + /* btf_id should be the type id of struct sched_ext_ops */ + btf_id = prog->aux->attach_btf_id; + st_ops = bpf_struct_ops_find(btf_id); + if (!st_ops) + return false; + + /* BTF type of struct sched_ext_ops */ + t = st_ops->type; + + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(t)) + return false; + + /* + * Get the member name of this struct_ops program, which corresponds to + * a field in struct sched_ext_ops. For example, the member name of the + * dispatch struct_ops program (callback) is "dispatch". + */ + member = &btf_type_member(t)[member_idx]; + mname = btf_name_by_offset(btf_vmlinux, member->name_off); + + /* + * Check if it is the second argument of the function pointer at + * "dispatch" in struct sched_ext_ops. The arguments of struct_ops + * operators are sequential and 64-bit, so the second argument is at + * offset sizeof(__u64). + */ + if (strcmp(mname, "dispatch") == 0 && + off == sizeof(__u64)) { + /* + * The value is a pointer to a type (struct task_struct) given + * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), + * however, can be a NULL (PTR_MAYBE_NULL). The BPF program + * should check the pointer to make sure it is not NULL before + * using it, or the verifier will reject the program. + * + * Longer term, this is something that should be addressed by + * BTF, and be fully contained within the verifier. 
+ */ + info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | + PTR_TRUSTED; + info->btf = btf_vmlinux; + info->btf_id = task_struct_type_id; + + return true; + } + + return false; +} + +static bool bpf_scx_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (promote_dispatch_2nd_arg(off, size, type, prog, info)) + return true; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t; + + t = btf_type_by_id(reg->btf, reg->btf_id); + if (t == task_struct_type) { + if (off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_func_proto * +bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +const struct bpf_verifier_ops bpf_scx_verifier_ops = { + .get_func_proto = bpf_scx_get_func_proto, + .is_valid_access = bpf_scx_is_valid_access, + .btf_struct_access = bpf_scx_btf_struct_access, +}; + +static int bpf_scx_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) 
+{ + const struct sched_ext_ops *uops = udata; + struct sched_ext_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + int ret; + + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch_max_batch): + if (*(u32 *)(udata + moff) > INT_MAX) + return -E2BIG; + ops->dispatch_max_batch = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, flags): + if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) + return -EINVAL; + ops->flags = *(u64 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, name): + ret = bpf_obj_name_cpy(ops->name, uops->name, + sizeof(ops->name)); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (*(u32 *)(udata + moff) > SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; + } + + return 0; +} + +static int bpf_scx_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct sched_ext_ops, init_task): +#ifdef CONFIG_EXT_GROUP_SCHED + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): +#endif + case offsetof(struct sched_ext_ops, init): + case offsetof(struct sched_ext_ops, exit): + break; + default: + if (prog->aux->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_scx_reg(void *kdata) +{ + return scx_ops_enable(kdata); +} + +static void bpf_scx_unreg(void *kdata) +{ + scx_ops_disable(SCX_EXIT_UNREG); + kthread_flush_work(&scx_ops_disable_work); +} + +static int bpf_scx_init(struct btf *btf) +{ + u32 type_id; + + type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + task_struct_type = btf_type_by_id(btf, type_id); + task_struct_type_id = type_id; + 
+ return 0; +} + +static int bpf_scx_update(void *kdata, void *old_kdata) +{ + /* + * sched_ext does not support updating the actively-loaded BPF + * scheduler, as registering a BPF scheduler can always fail if the + * scheduler returns an error code for e.g. ops.init(), + * ops.init_task(), etc. Similarly, we can always race with + * unregistration happening elsewhere, such as with sysrq. + */ + return -EOPNOTSUPP; +} + +static int bpf_scx_validate(void *kdata) +{ + return 0; +} + +/* "extern" to avoid sparse warning, only used in this file */ +extern struct bpf_struct_ops bpf_sched_ext_ops; + +static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + return -EINVAL; +} + +static void enqueue_stub(struct task_struct *p, u64 enq_flags) +{} + +static void dequeue_stub(struct task_struct *p, u64 enq_flags) +{} + +static void dispatch_stub(s32 prev_cpu, struct task_struct *p) +{} + +static void runnable_stub(struct task_struct *p, u64 enq_flags) +{} + +static void running_stub(struct task_struct *p) +{} + +static void stopping_stub(struct task_struct *p, bool runnable) +{} + +static void quiescent_stub(struct task_struct *p, u64 deq_flags) +{} + +static bool yield_stub(struct task_struct *from, struct task_struct *to) +{ + return false; +} + +static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) +{ + return false; +} + +static void set_weight_stub(struct task_struct *p, u32 weight) +{} + +static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) +{} + +static void update_idle_stub(s32 cpu, bool idle) +{} + +static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) +{} + +static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) +{} + +static s32 init_task_stub(struct task_struct *p, + struct scx_init_task_args *args) +{ + return -EINVAL; +} + + +static void exit_task_stub(struct task_struct *p, + struct scx_exit_task_args *args) +{} + +static void 
enable_stub(struct task_struct *p) +{} + +static void disable_stub(struct task_struct *p) +{} + +#ifdef CONFIG_EXT_GROUP_SCHED +static s32 cgroup_init_stub(struct cgroup *cgrp, + struct scx_cgroup_init_args *args) +{ + return -EINVAL; +} + +static void cgroup_exit_stub(struct cgroup *cgrp) +{} + +static s32 cgroup_prep_move_stub(struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{ + return -EINVAL; +} + +static void cgroup_move_stub(struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +static void cgroup_cancel_move_stub(struct task_struct *p, + struct cgroup *from, struct cgroup *to) +{} + +static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) +{} +#endif + +static void cpu_online_stub(s32 cpu) +{} + +static void cpu_offline_stub(s32 cpu) +{} + +static s32 init_stub(void) +{ + return -EINVAL; +} + +static void exit_stub(struct scx_exit_info *info) +{} + +static struct sched_ext_ops __bpf_ops_sched_ext_ops = { + .select_cpu = select_cpu_stub, + .enqueue = enqueue_stub, + .dequeue = dequeue_stub, + .dispatch = dispatch_stub, + .runnable = runnable_stub, + .running = running_stub, + .stopping = stopping_stub, + .quiescent = quiescent_stub, + .yield = yield_stub, + .core_sched_before = core_sched_before_stub, + .set_weight = set_weight_stub, + .set_cpumask = set_cpumask_stub, + .update_idle = update_idle_stub, + .cpu_acquire = cpu_acquire_stub, + .cpu_release = cpu_release_stub, + .init_task = init_task_stub, + .exit_task = exit_task_stub, + .enable = enable_stub, + .disable = disable_stub, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = cgroup_init_stub, + .cgroup_exit = cgroup_exit_stub, + .cgroup_prep_move = cgroup_prep_move_stub, + .cgroup_move = cgroup_move_stub, + .cgroup_cancel_move = cgroup_cancel_move_stub, + .cgroup_set_weight = cgroup_set_weight_stub, +#endif + .cpu_online = cpu_online_stub, + .cpu_offline = cpu_offline_stub, + .init = init_stub, + .exit = exit_stub, +}; + +struct bpf_struct_ops 
bpf_sched_ext_ops = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, + .name = "sched_ext_ops", + .cfi_stubs = &__bpf_ops_sched_ext_ops +}; + + +/******************************************************************************** + * System integration and init. + */ + +static void sysrq_handle_sched_ext_reset(u8 key) +{ + if (scx_ops_helper) + scx_ops_disable(SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF scheduler not yet used\n"); +} + +static const struct sysrq_key_op sysrq_sched_ext_reset_op = { + .handler = sysrq_handle_sched_ext_reset, + .help_msg = "reset-sched-ext(S)", + .action_msg = "Disable sched_ext and revert all tasks to CFS", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + int this_cpu = cpu_of(this_rq); + int cpu; + + for_each_cpu(cpu, this_rq->scx.cpus_to_kick) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (cpu_online(cpu) || cpu == this_cpu) { + if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) && + rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + pseqs[cpu] = rq->scx.pnt_seq; + resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_rq->scx.cpus_to_wait); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); + } + + for_each_cpu_andnot(cpu, this_rq->scx.cpus_to_wait, + cpumask_of(this_cpu)) { + /* + * Pairs with smp_store_release() issued by this CPU in + * scx_notify_pick_next_task() on the resched path. + * + * We busy-wait here to guarantee that no other task can be + * scheduled on our core before the target CPU has entered the + * resched path. 
+ */ + while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu]) + cpu_relax(); + } + + cpumask_clear(this_rq->scx.cpus_to_kick); + cpumask_clear(this_rq->scx.cpus_to_preempt); + cpumask_clear(this_rq->scx.cpus_to_wait); +} + +/** + * print_scx_info - print out sched_ext scheduler state + * @log_lvl: the log level to use when printing + * @p: target task + * + * If a sched_ext scheduler is enabled, print the name and state of the + * scheduler. If @p is on sched_ext, print further information about the task. + * + * This function can be safely called on any task as long as the task_struct + * itself is accessible. While safe, this function isn't synchronized and may + * print out mixups or garbages of limited length. + */ +void print_scx_info(const char *log_lvl, struct task_struct *p) +{ + enum scx_ops_enable_state state = scx_ops_enable_state(); + const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; + char runnable_at_buf[22] = "?"; + struct sched_class *class; + unsigned long runnable_at; + + if (state == SCX_OPS_DISABLED) + return; + + /* + * Carefully check if the task was running on sched_ext, and then + * carefully copy the time it's been runnable, and its state. 
+ */ + if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || + class != &ext_sched_class) { + printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, + scx_ops_enable_state_str[state], all); + return; + } + + if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, + sizeof(runnable_at))) + scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+lldms", + (s64)(runnable_at - jiffies) * (HZ / MSEC_PER_SEC)); + + /* print everything onto one line to conserve console space */ + printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", + log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, + runnable_at_buf); +} + +static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) +{ + if (!scx_enabled()) + return NOTIFY_OK; + + /* + * SCX schedulers often have userspace components which are sometimes + * involved in critial scheduling paths. PM operations involve freezing + * userspace which can lead to scheduling misbehaviors including stalls. + * Let's bypass while PM operations are in progress. + */ + switch (event) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + case PM_RESTORE_PREPARE: + scx_ops_bypass(true); + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + scx_ops_bypass(false); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_pm_notifier = { + .notifier_call = scx_pm_handler, +}; + +void __init init_sched_ext_class(void) +{ + int cpu; + u32 v; + + /* + * The following is to prevent the compiler from optimizing out the enum + * definitions so that BPF scheduler implementations can use them + * through the generated vmlinux.h. 
+ */ + WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | + SCX_TG_ONLINE | SCX_KICK_PREEMPT); + + BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); + init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); +#ifdef CONFIG_SMP + BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); +#endif + /* + * One sequence slot per CPU id, so size the array by nr_cpu_ids: + * num_possible_cpus() is smaller than the highest CPU id + 1 when the + * possible CPU ids are sparse. + * NOTE(review): assumes this array backs the pseqs[cpu] snapshot used + * in kick_cpus_irq_workfn() - confirm. + */ + scx_kick_cpus_pnt_seqs = + __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * + nr_cpu_ids, + __alignof__(scx_kick_cpus_pnt_seqs[0])); + BUG_ON(!scx_kick_cpus_pnt_seqs); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + INIT_LIST_HEAD(&rq->scx.runnable_list); + + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + } + + register_sysrq_key('S', &sysrq_sched_ext_reset_op); + INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); + scx_cgroup_config_knobs(); +} + + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ +#include <linux/btf_ids.h> + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_switch_all - Switch all tasks into SCX + * + * Switch all existing and future non-dl/rt tasks to SCX. This can only be + * called from ops.init(), and actual switching is performed asynchronously. 
+ */ +__bpf_kfunc static void scx_bpf_switch_all(void) +{ + if (!scx_kf_allowed(SCX_KF_INIT)) + return; + + scx_switch_all_req = true; +} + +BTF_SET8_START(scx_kfunc_ids_init) +BTF_ID_FLAGS(func, scx_bpf_switch_all) +BTF_SET8_END(scx_kfunc_ids_init) + +static const struct btf_kfunc_id_set scx_kfunc_set_init = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_init, +}; + +/** + * scx_bpf_create_dsq - Create a custom DSQ + * @dsq_id: DSQ to create + * @node: NUMA node to allocate from + * + * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), + * ops.init_task(), ops.cgroup_init() and ops.cgroup_prep_move(). + */ +__bpf_kfunc static s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) +{ + if (!scx_kf_allowed(SCX_KF_INIT | SCX_KF_SLEEPABLE)) + return -EINVAL; + + if (unlikely(node >= (int)nr_node_ids || + (node < 0 && node != NUMA_NO_NODE))) + return -EINVAL; + return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); +} + +BTF_SET8_START(scx_kfunc_ids_sleepable) +BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_SET8_END(scx_kfunc_ids_sleepable) + +static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_sleepable, +}; + +__bpf_kfunc_end_defs(); + +static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) +{ + if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + return false; + + lockdep_assert_irqs_disabled(); + + if (unlikely(!p)) { + scx_ops_error("called with NULL task"); + return false; + } + + if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { + scx_ops_error("invalid enq_flags 0x%llx", enq_flags); + return false; + } + + return true; +} + +static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) +{ + struct task_struct *ddsp_task; + int idx; + + ddsp_task = __this_cpu_read(direct_dispatch_task); + if (ddsp_task) { + mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + return; + } + + idx = __this_cpu_read(scx_dsp_ctx.buf_cursor); + if 
(unlikely(idx >= scx_dsp_max_batch)) { + scx_ops_error("dispatch buffer overflow"); + return; + } + + this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){ + .task = p, + .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, + .dsq_id = dsq_id, + .enq_flags = enq_flags, + }; + __this_cpu_inc(scx_dsp_ctx.buf_cursor); +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe + * to call this function spuriously. Can be called from ops.enqueue(), + * ops.select_cpu(), and ops.dispatch(). + * + * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch + * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be + * used to target the local DSQ of a CPU other than the enqueueing one. Use + * ops.select_cpu() to be on the target CPU in the first place. + * + * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p + * will be directly dispatched to the corresponding dispatch queue after + * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be + * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). + * @enq_flags are OR'd with the enqueue flags on the enqueue path before the + * task is dispatched. + * + * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id + * and this function can be called up to ops.dispatch_max_batch times to dispatch + * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the + * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * + * This function doesn't have any locking restrictions and may be called under + * BPF locks (in the future when BPF introduces more flexible locking). 
+ * + * @p is allowed to run for @slice. The scheduling path is triggered on slice + * exhaustion. If zero, the current residual slice is maintained. If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. + */ +__bpf_kfunc +static void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + scx_dispatch_commit(p, dsq_id, enq_flags); +} + +/** + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime and always + * consumed after the tasks in the FIFO queue. All other aspects are identical + * to scx_bpf_dispatch(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. 
+ */ +__bpf_kfunc +static void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, + u64 vtime, u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +BTF_SET8_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) +BTF_SET8_END(scx_kfunc_ids_enqueue_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + +/** + * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * + * Can only be called from ops.dispatch(). + */ +__bpf_kfunc static u32 scx_bpf_dispatch_nr_slots(void) +{ + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return 0; + + return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx.buf_cursor); +} + +/** + * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to consume + * + * Consume a task from the non-local DSQ identified by @dsq_id and transfer it + * to the current CPU's local DSQ for execution. Can only be called from + * ops.dispatch(). + * + * This function flushes the in-flight dispatches from scx_bpf_dispatch() before + * trying to consume the specified DSQ. It may also grab rq locks and thus can't + * be called under any BPF locks. + * + * Returns %true if a task has been consumed, %false if there isn't any task to + * consume. 
+ */ +__bpf_kfunc static bool scx_bpf_consume(u64 dsq_id) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); + struct scx_dispatch_q *dsq; + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + flush_dispatch_buf(dspc->rq, dspc->rf); + + dsq = find_non_local_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + return false; + } + + if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { + /* + * A successfully consumed task can be dequeued before it starts + * running while the CPU is trying to migrate other dispatched + * tasks. Bump nr_tasks to tell balance_scx() to retry on empty + * local DSQ. + */ + dspc->nr_tasks++; + return true; + } else { + return false; + } +} + +BTF_SET8_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_SET8_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_dispatch, +}; + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +__bpf_kfunc static u32 scx_bpf_reenqueue_local(void) +{ + u32 nr_enqueued, i; + struct rq *rq; + struct scx_rq *scx_rq; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + scx_rq = &rq->scx; + + /* + * Get the number of tasks on the local DSQ before iterating over it to + * pull off tasks. The enqueue callback below can signal that it wants + * the task to stay on the local DSQ, and we want to prevent the BPF + * scheduler from causing us to loop indefinitely. 
+ */ + nr_enqueued = scx_rq->local_dsq.nr; + for (i = 0; i < nr_enqueued; i++) { + struct task_struct *p; + + p = first_local_task(rq); + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != + SCX_OPSS_NONE); + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + WARN_ON_ONCE(p->scx.holding_cpu != -1); + dispatch_dequeue(scx_rq, p); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + } + + return nr_enqueued; +} + +BTF_SET8_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_SET8_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc static void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *rq; + + if (!ops_cpu_valid(cpu)) { + scx_ops_error("invalid cpu %d", cpu); + return; + } + + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. + */ + if (scx_ops_bypassing()) + return; + + preempt_disable(); + rq = this_rq(); + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. 
+ */ + cpumask_set_cpu(cpu, rq->scx.cpus_to_kick); + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, rq->scx.cpus_to_wait); + + irq_work_queue(&rq->scx.kick_cpus_irq_work); + preempt_enable(); +} + +/** + * scx_bpf_dsq_nr_queued - Return the number of queued tasks + * @dsq_id: id of the DSQ + * + * Return the number of tasks in the DSQ matching @dsq_id. If not found, + * -%ENOENT is returned. Can be called from any non-sleepable online scx_ops + * operations. + */ +__bpf_kfunc static s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + + lockdep_assert(rcu_read_lock_any_held()); + + if (dsq_id == SCX_DSQ_LOCAL) { + return this_rq()->scx.local_dsq.nr; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (ops_cpu_valid(cpu)) + return cpu_rq(cpu)->scx.local_dsq.nr; + } else { + dsq = find_non_local_dsq(dsq_id); + if (dsq) + return dsq->nr; + } + return -ENOENT; +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc static bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return false; + } + + if (ops_cpu_valid(cpu)) + return test_and_clear_cpu_idle(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. 
+ * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc +static s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return -EBUSY; + } + + return scx_pick_idle_cpu(cpus_allowed, flags); +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc +static s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + s32 cpu; + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. 
+ * + * Returns cpu_none_mask, not NULL, if idle tracking is not enabled or when + * running on a UP kernel. + */ +__bpf_kfunc static const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return cpu_none_mask; + } + +#ifdef CONFIG_SMP + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Falls back to the per-CPU idle cpumask when SMT is not active. Returns + * cpu_none_mask, not NULL, if idle tracking is not enabled or when running on + * a UP kernel. + */ +__bpf_kfunc static const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (!static_branch_likely(&scx_builtin_idle_enabled)) { + scx_ops_error("built-in idle tracking is disabled"); + return cpu_none_mask; + } + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_masks.smt; + else + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + */ +__bpf_kfunc +static void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or + * releasing a reference to a global idle cpumask, which is read-only + * in the caller and is never released. The acquire / release semantics + * here are just used to make the cpumask a trusted pointer in the + * caller. 
+ */ +} + +struct scx_bpf_error_bstr_bufs { + u64 data[MAX_BPRINTF_VARARGS]; + char msg[SCX_EXIT_MSG_LEN]; +}; + +static DEFINE_PER_CPU(struct scx_bpf_error_bstr_bufs, scx_bpf_error_bstr_bufs); + +/** + * scx_bpf_error_bstr - Indicate fatal error + * @fmt: error message format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * Indicate that the BPF scheduler encountered a fatal error and initiate ops + * disabling. + */ +__bpf_kfunc +static void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) +{ + struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; + struct scx_bpf_error_bstr_bufs *bufs; + unsigned long flags; + int ret; + + local_irq_save(flags); + bufs = this_cpu_ptr(&scx_bpf_error_bstr_bufs); + + if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || + (data__sz && !data)) { + scx_ops_error("invalid data=%p and data__sz=%u", + (void *)data, data__sz); + goto out_restore; + } + + ret = copy_from_kernel_nofault(bufs->data, data, data__sz); + if (ret) { + scx_ops_error("failed to read data fields (%d)", ret); + goto out_restore; + } + + ret = bpf_bprintf_prepare(fmt, UINT_MAX, bufs->data, data__sz / 8, + &bprintf_data); + if (ret < 0) { + scx_ops_error("failed to format prepration (%d)", ret); + goto out_restore; + } + + ret = bstr_printf(bufs->msg, sizeof(bufs->msg), fmt, + bprintf_data.bin_args); + bpf_bprintf_cleanup(&bprintf_data); + if (ret < 0) { + scx_ops_error("scx_ops_error(\"%s\", %p, %u) failed to format", + fmt, data, data__sz); + goto out_restore; + } + + scx_ops_error_kind(SCX_EXIT_ERROR_BPF, "%s", bufs->msg); +out_restore: + local_irq_restore(flags); +} + +/** + * scx_bpf_destroy_dsq - Destroy a custom DSQ + * @dsq_id: DSQ to destroy + * + * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with + * scx_bpf_create_dsq() can be destroyed. 
The caller must ensure that the DSQ is + * empty and no further tasks are dispatched to it. Ignored if called on a DSQ + * which doesn't exist. Can be called from any online scx_ops operations. + */ +__bpf_kfunc static void scx_bpf_destroy_dsq(u64 dsq_id) +{ + destroy_dsq(dsq_id); +} + +/** + * scx_bpf_task_running - Is task currently running? + * @p: task of interest + */ +__bpf_kfunc static bool scx_bpf_task_running(const struct task_struct *p) +{ + return task_rq(p)->curr == p; +} + +/** + * scx_bpf_task_cpu - CPU a task is currently associated with + * @p: task of interest + */ +__bpf_kfunc static s32 scx_bpf_task_cpu(const struct task_struct *p) +{ + return task_cpu(p); +} + +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all + * rq-locked operations. Can be called on the parameter tasks of rq-locked + * operations. The restriction guarantees that @p's rq is locked by the caller. + */ +#ifdef CONFIG_CGROUP_SCHED +__bpf_kfunc static struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + goto out; + + /* + * A task_group may either be a cgroup or an autogroup. In the latter + * case, @tg->css.cgroup is %NULL. A task_group can't become the other + * kind once created. 
+ */ + if (tg && tg->css.cgroup) + cgrp = tg->css.cgroup; + else + cgrp = &cgrp_dfl_root.cgrp; +out: + cgroup_get(cgrp); + return cgrp; +} +#endif + +BTF_SET8_START(scx_kfunc_ids_ops_only) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_SET8_END(scx_kfunc_ids_ops_only) + +static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_ops_only, +}; + +BTF_SET8_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif +BTF_SET8_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_any, +}; + +__bpf_kfunc_end_defs(); + +static int __init scx_init(void) +{ + int ret; + + /* + * kfunc registration can't be done from init_sched_ext_class() as + * register_btf_kfunc_id_set() needs most of the system to be up. + * + * Some kfuncs are context-sensitive and can only be called from + * specific SCX ops. They are grouped into BTF sets accordingly. + * Unfortunately, BPF currently doesn't have a way of enforcing such + * restrictions. Eventually, the verifier should be able to enforce + * them. For now, register them the same and make each kfunc explicitly + * check using scx_kf_allowed(). 
+ */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_init)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_sleepable)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_ops_only)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &scx_kfunc_set_any))) { + pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); + return ret; + } + + ret = register_pm_notifier(&scx_pm_notifier); + if (ret) { + pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); + return ret; + } + + /* the kset is parented to kernel_kobj, i.e. it lives under /sys/kernel */ + scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); + if (!scx_kset) { + pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); + return -ENOMEM; + } + + ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); + if (ret < 0) { + pr_err("sched_ext: Failed to add global attributes\n"); + return ret; + } + + return 0; +} +__initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 index 0000000000000..3aa6598ad2312 --- /dev/null +++ b/kernel/sched/ext.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_EXEC = WF_EXEC, + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * If the BPF scheduler wants to continue executing the task, + * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. + * If the task gets queued on a different dsq or the BPF side, the BPF + * scheduler is responsible for triggering a follow-up scheduling event. + * Otherwise, execution may stall. 
+ */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +}; + +enum scx_kick_flags { + SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ + SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +#ifdef CONFIG_SCHED_CLASS_EXT + +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +extern const struct sched_class ext_sched_class; +extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; +extern unsigned long scx_watchdog_timeout; +extern unsigned long scx_watchdog_timestamp; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); + +static inline bool task_on_scx(const struct task_struct *p) +{ + return scx_enabled() && p->sched_class == &ext_sched_class; +} + +bool task_should_scx(struct task_struct *p); +void scx_pre_fork(struct task_struct *p); +int scx_fork(struct task_struct *p); +void scx_post_fork(struct task_struct 
*p); +void scx_cancel_fork(struct task_struct *p); +int scx_check_setscheduler(struct task_struct *p, int policy); +bool scx_can_stop_tick(struct rq *rq); +void init_sched_ext_class(void); + +__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, + const char *fmt, ...); +#define scx_ops_error(fmt, args...) \ + scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) + +void __scx_notify_pick_next_task(struct rq *rq, + struct task_struct *p, + const struct sched_class *active); + +static inline void scx_notify_pick_next_task(struct rq *rq, + struct task_struct *p, + const struct sched_class *active) +{ + if (!scx_enabled()) + return; +#ifdef CONFIG_SMP + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. + */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + __scx_notify_pick_next_task(rq, p, active); +} + +static inline void scx_notify_sched_tick(void) +{ + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } +} + +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; + return class; +} + +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) + +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +/* + * SCX requires a balance() call before every 
pick_next_task() call including + * when waking up from idle. + */ +#define for_balance_class_range(class, prev_class, end_class) \ + for_active_class_range(class, (prev_class) > &ext_sched_class ? \ + &ext_sched_class : (prev_class), (end_class)) + +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + +#else /* CONFIG_SCHED_CLASS_EXT */ + +#define scx_enabled() false +#define scx_switched_all() false + +static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline int scx_check_setscheduler(struct task_struct *p, + int policy) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void init_sched_ext_class(void) {} +static inline void scx_notify_pick_next_task(struct rq *rq, + const struct task_struct *p, + const struct sched_class *active) {} +static inline void scx_notify_sched_tick(void) {} + +#define for_each_active_class for_each_class +#define for_balance_class_range for_class_range + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +void __scx_update_idle(struct rq *rq, bool idle); + +static inline void scx_update_idle(struct rq *rq, bool idle) +{ + if (scx_enabled()) + __scx_update_idle(rq, idle); +} +#else +static inline void scx_update_idle(struct rq *rq, bool idle) {} +#endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset 
*tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 533547e3c90a7..4f398762aef2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3829,7 +3829,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8310,7 +8310,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -12458,14 +12458,14 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } -static void rq_online_fair(struct rq *rq) +static void rq_online_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); update_runtime_enabled(rq); } -static void rq_offline_fair(struct rq *rq) +static void rq_offline_fair(struct rq *rq, enum rq_onoff_reason reason) { update_sysctl(); @@ -13147,6 +13147,7 @@ DEFINE_SCHED_CLASS(fair) = { 
.task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 31231925f1ece..0179ce0c70c12 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -438,11 +438,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + scx_update_idle(rq, false); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); + scx_update_idle(rq, true); schedstat_inc(rq->sched_goidle); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3261b067b67e2..8620474d117dc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2426,7 +2426,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) } /* Assumes rq->lock is held */ -static void rq_online_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -2437,7 +2437,7 @@ static void rq_online_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void rq_offline_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq, enum rq_onoff_reason reason) { if (rq->rt.overloaded) rt_clear_overload(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 001fe047bd5d8..3cede157b6a62 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -171,9 +171,19 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline 
int rt_policy(int policy) @@ -221,6 +231,24 @@ static inline void update_avg(u64 *avg, u64 sample) #define shr_bound(val, shift) \ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) +/* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + /* * !! For sched_setattr_nocheck() (kernel) only !! * @@ -404,6 +432,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -459,6 +492,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? 
container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -515,6 +553,11 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + return 0; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -675,6 +718,28 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_CAN_STOP_TICK = 1 << 0, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head runnable_list; /* runnable tasks on this rq */ + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 flags; + bool cpu_released; + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -1015,6 +1080,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -2248,6 +2316,11 @@ extern const u32 sched_prio_to_wmult[40]; #define RETRY_TASK ((void *)-1UL) +enum rq_onoff_reason { + RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ + RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ +}; + struct affinity_context { const struct cpumask *new_mask; struct cpumask *user_mask; @@ -2286,8 +2359,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); - void (*rq_online)(struct rq 
*rq); - void (*rq_offline)(struct rq *rq); + void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); + void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif @@ -2301,8 +2374,11 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + int newprio); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2460,7 +2536,7 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); +extern void __setscheduler_prio(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); @@ -2540,6 +2616,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT @@ -2821,8 +2903,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) raw_spin_rq_unlock(rq1); } -extern void set_rq_online (struct rq *rq); -extern void set_rq_offline(struct rq *rq); +extern void set_rq_online (struct rq *rq, enum rq_onoff_reason 
reason); +extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); extern bool sched_smp_initialized; #else /* CONFIG_SMP */ @@ -3473,4 +3555,27 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_CGROUP_SCHED +enum cpu_cftype_id { +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) + CPU_CFTYPE_WEIGHT, + CPU_CFTYPE_WEIGHT_NICE, + CPU_CFTYPE_IDLE, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + CPU_CFTYPE_MAX, + CPU_CFTYPE_MAX_BURST, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + CPU_CFTYPE_UCLAMP_MIN, + CPU_CFTYPE_UCLAMP_MAX, +#endif + CPU_CFTYPE_CNT, +}; + +extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; +#endif /* CONFIG_CGROUP_SCHED */ + +#include "ext.h" + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 10d1391e74161..7798063b3a383 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -497,7 +497,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) old_rd = rq->rd; if (cpumask_test_cpu(rq->cpu, old_rd->online)) - set_rq_offline(rq); + set_rq_offline(rq, RQ_ONOFF_TOPOLOGY); cpumask_clear_cpu(rq->cpu, old_rd->span); @@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_set_cpu(rq->cpu, rd->span); if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); + set_rq_online(rq, RQ_ONOFF_TOPOLOGY); rq_unlock_irqrestore(rq, &rf); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 83471e81501a7..6e667c445539b 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); + print_scx_info(log_lvl, current); } /** diff --git a/tools/Makefile b/tools/Makefile index 37e9f68048326..8021267f7e5b6 100644 --- a/tools/Makefile +++ 
b/tools/Makefile @@ -29,6 +29,7 @@ help: @echo ' pci - PCI tools' @echo ' perf - Linux performance measurement and analysis tool' @echo ' selftests - various kernel selftests' + @echo ' sched_ext - sched_ext example schedulers' @echo ' bootconfig - boot config tool' @echo ' spi - spi tools' @echo ' tmon - thermal monitoring and tuning tool' @@ -92,6 +93,9 @@ perf: FORCE $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= +sched_ext: FORCE + $(call descend,sched_ext) + selftests: FORCE $(call descend,testing/$@) @@ -185,6 +189,9 @@ perf_clean: $(Q)mkdir -p $(PERF_O) . $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean +sched_ext_clean: + $(call descend,sched_ext,clean) + selftests_clean: $(call descend,testing/$(@:_clean=),clean) @@ -214,6 +221,7 @@ clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_cl mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ - intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean + intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ + sched_ext_clean .PHONY: FORCE diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore new file mode 100644 index 0000000000000..215ed36b2a94a --- /dev/null +++ b/tools/sched_ext/.gitignore @@ -0,0 +1,10 @@ +scx_simple +scx_qmap +scx_central +scx_pair +scx_flatcg +scx_userland +*.skel.h +*.subskel.h +/tools/ +build/ diff --git a/tools/sched_ext/Kconfig b/tools/sched_ext/Kconfig new file mode 100644 index 0000000000000..6543fcf199f6e --- /dev/null +++ b/tools/sched_ext/Kconfig @@ -0,0 +1,9 @@ +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y 
+CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile new file mode 100644 index 0000000000000..7db68d2053765 --- /dev/null +++ b/tools/sched_ext/Makefile @@ -0,0 +1,301 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. +include ../build/Build.include +include ../scripts/Makefile.arch +include ../scripts/Makefile.include + +all: all_targets + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi +CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu +CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl +CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu +CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu +CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu +CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu +CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu +CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + +ifeq ($(CROSS_COMPILE),) +ifeq ($(CLANG_TARGET_FLAGS),) +$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) +else +CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) +endif # CLANG_TARGET_FLAGS +else +CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) +endif # CROSS_COMPILE + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := $(CROSS_COMPILE)gcc +endif # LLVM + +CURDIR := $(abspath .) +TOOLSDIR := $(abspath ..) 
+LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(abspath ../../include/generated) +GENHDR := $(GENDIR)/autoconf.h + +ifeq ($(O),) +OUTPUT_DIR := $(CURDIR)/build +else +OUTPUT_DIR := $(O)/build +endif # O +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BINDIR := $(OUTPUT_DIR)/bin +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +ifneq ($(CROSS_COMPILE),) +HOST_BUILD_DIR := $(OBJ_DIR)/host +HOST_OUTPUT_DIR := host-tools +HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include +else +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) +HOST_INCLUDE_DIR := $(INCLUDE_DIR) +endif +HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a +RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids +DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include + +CARGOFLAGS := --release --target-dir $(OUTPUT_DIR) +ifneq ($(CARGO_OFFLINE),) +CARGOFLAGS += --offline +endif + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ 
+$(shell $(1) -dM -E - $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ + | $(BPFOBJ) $(SCXOBJ_DIR) + $(call msg,CLNG-BPF,,$(notdir $@)) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) + $(eval sched=$(notdir $@)) + $(call msg,GEN-SKEL,,$(sched)) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) + +SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) + +################ +# C schedulers # +################ +c-sched-targets = scx_simple scx_qmap scx_central scx_pair scx_flatcg \ + scx_userland + +$(addprefix $(BINDIR)/,$(c-sched-targets)): \ + $(BINDIR)/%: \ + $(filter-out %.bpf.c,%.c) \ + $(INCLUDE_DIR)/%.bpf.skel.h \ + $(SCX_COMMON_DEPS) + $(eval sched=$(notdir $@)) + $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o + $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) +$(c-sched-targets): %: $(BINDIR)/% + + +################### +# Rust schedulers # +################### +rust-sched-targets := scx_rusty scx_layered + +# Separate build target that is available for build systems to use to fetch +# dependencies in a separate step from building. This allows the scheduler +# to be compiled without network access. +# +# If the regular rust scheduler Make target (e.g. scx_rusty) is invoked without +# CARGO_OFFLINE=1 (e.g. if building locally), then cargo build will download +# all of the necessary dependencies, and the deps target can be skipped. 
+$(addsuffix _deps,$(rust-sched-targets)): + $(eval sched=$(@:_deps=)) + $(Q)cargo fetch --manifest-path=$(sched)/Cargo.toml + +$(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) + $(eval export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR)) + $(eval export BPF_CLANG = $(CLANG)) + $(eval export BPF_CFLAGS = $(BPF_CFLAGS)) + $(eval sched=$(notdir $@)) + $(Q)cargo build --manifest-path=$(sched)/Cargo.toml $(CARGOFLAGS) + $(Q)cp $(OUTPUT_DIR)/release/$(sched) $(BINDIR)/$@ + +install: all + $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ + $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ + +clean: + $(foreach sched,$(rust-sched-targets),cargo clean --manifest-path=$(sched)/Cargo.toml;) + rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) + rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h + rm -f $(c-sched-targets) + +help: + @echo 'Building targets' + @echo '================' + @echo '' + @echo ' all - Compile all schedulers' + @echo '' + @echo 'Alternatively, you may compile individual schedulers:' + @echo '' + @printf ' %s\n' $(c-sched-targets) $(rust-sched-targets) + @echo '' + @echo 'For any scheduler build target, you may specify an alternative' + @echo 'build output path with the O= environment variable. For example:' + @echo '' + @echo ' O=/tmp/sched_ext make all' + @echo '' + @echo 'will compile all schedulers, and emit the build artifacts to' + @echo '/tmp/sched_ext/build.' + @echo '' + @echo '' + @echo 'Rust scheduler targets' + @echo '======================' + @echo '' + @printf ' %s\n' $(rust-sched-targets) + @printf ' %s_deps\n' $(rust-sched-targets) + @echo '' + @echo 'For any rust schedulers built with cargo, you can specify' + @echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the' + @echo 'network (e.g. if the scheduler is being packaged).' 
+ @echo '' + @echo 'For such use cases, the build workflow will look something like this:' + @echo '' + @echo ' make scx_rusty_deps' + @echo ' CARGO_OFFLINE=1 make scx_rusty' + @echo '' + @echo 'If network access during build is allowed, you can just make scx_rusty' + @echo 'directly without CARGO_OFFLINE, and dependencies will be downloaded' + @echo 'during the build step.' + @echo '' + @echo '' + @echo 'Installing targets' + @echo '==================' + @echo '' + @echo ' install - Compile and install all schedulers to /usr/local/bin.' + @echo ' You may specify the DESTDIR= environment variable' + @echo ' to indicate a prefix for /usr/local/bin. For example:' + @echo '' + @echo ' DESTDIR=/tmp/sched_ext make install' + @echo '' + @echo ' will build the schedulers in CWD/build, and' + @echo ' install the schedulers to /tmp/sched_ext/usr/local/bin.' + @echo '' + @echo '' + @echo 'Cleaning targets' + @echo '================' + @echo '' + @echo ' clean - Remove all generated files, including intermediate' + @echo ' rust files for rust schedulers.' + +all_targets: $(c-sched-targets) $(rust-sched-targets) + +.PHONY: all all_targets $(c-sched-targets) $(rust-sched-targets) clean help install + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md new file mode 100644 index 0000000000000..8e7194ada3310 --- /dev/null +++ b/tools/sched_ext/README.md @@ -0,0 +1,403 @@ +SCHED_EXT EXAMPLE SCHEDULERS +============================ + +# Introduction + +This directory contains a number of example sched_ext schedulers. These +schedulers are meant to provide examples of different types of schedulers +that can be built using sched_ext, and illustrate how various features of +sched_ext can be used. + +Some of the examples are performant, production-ready schedulers.
That is, for +the correct workload and with the correct tuning, they may be deployed in a +production environment with acceptable or possibly even improved performance. +Others are just examples that, in practice, would not provide acceptable +performance (though they could be improved to get there). + +This README will describe these example schedulers, including describing the +types of workloads or scenarios they're designed to accommodate, and whether or +not they're production ready. For more details on any of these schedulers, +please see the header comment in their .bpf.c file. + + +# Compiling the examples + +There are a few toolchain dependencies for compiling the example schedulers. + +## Toolchain dependencies + +1. clang >= 16.0.0 + +The schedulers are BPF programs, and therefore must be compiled with clang. gcc +is actively working on adding a BPF backend compiler as well, but is still +missing some features such as BTF type tags which are necessary for using +kptrs. + +2. pahole >= 1.25 + +You may need pahole in order to generate BTF from DWARF. + +3. rust >= 1.70.0 + +Rust schedulers use features present in the rust toolchain >= 1.70.0. You +should be able to use the stable build from rustup, but if that doesn't +work, try using the rustup nightly build. + +There are other requirements as well, such as make, but these are the main / +non-trivial ones.
+ +## Compiling the kernel + +In order to run a sched_ext scheduler, you'll have to run a kernel compiled +with the patches in this repository, and with a minimum set of necessary +Kconfig options: + +``` +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +``` + +It's also recommended that you include the following Kconfig options: + +``` +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y +``` + +There is a `Kconfig` file in this directory whose contents you can append to +your local `.config` file, as long as there are no conflicts with any existing +options in the file. + +## Getting a vmlinux.h file + +You may notice that most of the example schedulers include a "vmlinux.h" file. +This is a large, auto-generated header file that contains all of the types +defined in some vmlinux binary that was compiled with +[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig +options specified above). + +The header file is created using `bpftool`, by passing it a vmlinux binary +compiled with BTF as follows: + +```bash +$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h +``` + +`bpftool` analyzes all of the BTF encodings in the binary, and produces a +header file that can be included by BPF programs to access those types. For +example, using vmlinux.h allows a scheduler to access fields defined directly +in vmlinux as follows: + +```c +#include "vmlinux.h" +// vmlinux.h is also implicitly included by scx_common.bpf.h. +#include "scx_common.bpf.h" + +/* + * vmlinux.h provides definitions for struct task_struct and + * struct scx_enable_args. + */ +void BPF_STRUCT_OPS(example_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + bpf_printk("Task %s enabled in example scheduler", p->comm); +} + +// vmlinux.h provides the definition for struct sched_ext_ops.
+SEC(".struct_ops.link") +struct sched_ext_ops example_ops = { + .enable = (void *)example_enable, + .name = "example", +}; +``` + +The scheduler build system will generate this vmlinux.h file as part of the +scheduler build pipeline. It looks for a vmlinux file in the following +dependency order: + +1. If the O= environment variable is defined, at `$O/vmlinux` +2. If the KBUILD_OUTPUT= environment variable is defined, at + `$KBUILD_OUTPUT/vmlinux` +3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're + compiling the schedulers) +4. `/sys/kernel/btf/vmlinux` +5. `/boot/vmlinux-$(uname -r)` + +In other words, if you have compiled a kernel in your local repo, its vmlinux +file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of +the kernel you're currently running on. This means that if you're running on a +kernel with sched_ext support, you may not need to compile a local kernel at +all. + +### Aside on CO-RE + +One of the cooler features of BPF is that it supports +[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run +Everywhere). This feature allows you to reference fields inside of structs with +types defined internal to the kernel, and not have to recompile if you load the +BPF program on a different kernel with the field at a different offset. In our +example above, we print out a task name with `p->comm`. CO-RE would perform +relocations for that access when the program is loaded to ensure that it's +referencing the correct offset for the currently running kernel. + +## Compiling the schedulers + +Once you have your toolchain set up, and a vmlinux that can be used to generate +a full vmlinux.h file, you can compile the schedulers using `make`: + +```bash +$ make -j$(nproc) +``` + +# Schedulers + +This section lists, in alphabetical order, all of the current example +schedulers.
+ +-------------------------------------------------------------------------------- + +## scx_simple + +### Overview + +A simple scheduler that provides an example of a minimal sched_ext +scheduler. scx_simple can be run in either global weighted vtime mode, or +FIFO mode. + +### Typical Use Case + +Though very simple, this scheduler should perform reasonably well on +single-socket CPUs with a uniform L3 cache topology. Note that while running in +global FIFO mode may work well for some workloads, saturating threads can +easily drown out inactive ones. + +### Production Ready? + +This scheduler could be used in a production environment, assuming the hardware +constraints enumerated above, and assuming the workload can accommodate a +simple scheduling policy. + +-------------------------------------------------------------------------------- + +## scx_qmap + +### Overview + +Another simple, yet slightly more complex scheduler that provides an example of +a basic weighted FIFO queuing policy. It also provides examples of some common +useful BPF features, such as sleepable per-task storage allocation in the +`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to +enqueue tasks. It also illustrates how core-sched support could be implemented. + +### Typical Use Case + +Purely used to illustrate sched_ext features. + +### Production Ready? + +No + +-------------------------------------------------------------------------------- + +## scx_central + +### Overview + +A "central" scheduler where scheduling decisions are made from a single CPU. +This scheduler illustrates how scheduling decisions can be dispatched from a +single CPU, allowing other cores to run with infinite slices, without timer +ticks, and without having to incur the overhead of making scheduling decisions. + +### Typical Use Case + +This scheduler could theoretically be useful for any workload that benefits +from minimizing scheduling overhead and timer ticks. 
An example of where this +could be particularly useful is running VMs, where running with infinite slices +and no timer ticks allows the VM to avoid unnecessary expensive vmexits. + +### Production Ready? + +Not yet. While tasks are run with an infinite slice (SCX_SLICE_INF), they're +preempted every 20ms in a timer callback. The scheduler also puts the core +scheduling logic inside of the central / scheduling CPU's ops.dispatch() path, +and does not yet have any kind of priority mechanism. + +-------------------------------------------------------------------------------- + +## scx_pair + +### Overview + +A sibling scheduler which ensures that tasks will only ever be co-located on a +physical core if they're in the same cgroup. It illustrates how a scheduling +policy could be implemented to mitigate CPU bugs, such as L1TF, and also shows +how some useful kfuncs such as `scx_bpf_kick_cpu()` can be utilized. + +### Typical Use Case + +While this scheduler is only meant to be used to illustrate certain sched_ext +features, with a bit more work (e.g. by adding some form of priority handling +inside and across cgroups), it could have been used as a way to quickly +mitigate L1TF before core scheduling was implemented and rolled out. + +### Production Ready? + +No + +-------------------------------------------------------------------------------- + +## scx_flatcg + +### Overview + +A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical +weight-based cgroup CPU control by flattening the cgroup hierarchy into a +single layer, by compounding the active weight share at each level. The effect +of this is a much more performant CPU controller, which does not need to +descend down cgroup trees in order to properly compute a cgroup's share. + +### Typical Use Case + +This scheduler could be useful for any typical workload requiring a CPU +controller, but which cannot tolerate the higher overheads of the fair CPU +controller. + +### Production Ready?
+ +Yes, though the scheduler (currently) does not adequately accommodate +thundering herds of cgroups. If, for example, many cgroups which are nested +behind a low-priority cgroup were to wake up around the same time, they may be +able to consume more CPU cycles than they are entitled to. + +-------------------------------------------------------------------------------- + +## scx_userland + +### Overview + +A simple weighted vtime scheduler where all scheduling decisions take place in +user space. This is in contrast to Rusty, where load balancing lives in user +space, but scheduling decisions are still made in the kernel. + +### Typical Use Case + +There are many advantages to writing schedulers in user space. For example, you +can use a debugger, you can write the scheduler in Rust, and you can use data +structures bundled with your favorite library. + +On the other hand, user space scheduling can be hard to get right. You can +potentially deadlock due to not scheduling a task that's required for the +scheduler itself to make forward progress (though the sched_ext watchdog will +protect the system by unloading your scheduler after a timeout if that +happens). You also have to bootstrap some communication protocol between the +kernel and user space. + +A more robust solution to this would be building a user space scheduling +framework that abstracts much of this complexity away from you. + +### Production Ready? + +No. This scheduler uses an ordered list for vtime scheduling, and is strictly +less performant than just using something like `scx_simple`. It is purely +meant to illustrate that it's possible to build a user space scheduler on +top of sched_ext. + +-------------------------------------------------------------------------------- + +## scx_rusty + +### Overview + +A multi-domain, BPF / user space hybrid scheduler.
The BPF portion of the +scheduler does a simple round robin in each domain, and the user space portion +(written in Rust) calculates the load factor of each domain, and informs BPF of +how tasks should be load balanced accordingly. + +### Typical Use Case + +Rusty is designed to be flexible, and accommodate different architectures and +workloads. Various load balancing thresholds (e.g. greediness, frequency, etc.), +as well as how Rusty should partition the system into scheduling domains, can +be tuned to achieve the optimal configuration for any given system or workload. + +### Production Ready? + +Yes. If tuned correctly, rusty should be performant across various CPU +architectures and workloads. Rusty by default creates a separate scheduling +domain per-LLC, so its default configuration may be performant as well. + +That said, you may run into an issue with infeasible weights, where a task with +a very high weight may cause the scheduler to incorrectly leave cores idle +because it thinks they're necessary to accommodate the compute for a single +task. This can also happen in CFS, and should soon be addressed for rusty. + +-------------------------------------------------------------------------------- + +# Troubleshooting + +There are a number of common issues that you may run into when building the +schedulers. We'll go over some of the common ones here. + +## Build Failures + +### Old version of clang + +``` +error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + ^~~~~~~~~~~~~~~~~~~~ +1 error generated. +``` + +This means you built the kernel or the schedulers with an older version of +clang than what's supported (i.e. older than 16.0.0). To remediate this: + +1. `which clang` to make sure you're using a sufficiently new version of clang. + +2. 
`make fullclean` in the root path of the repository, and rebuild the kernel + and schedulers. + +3. Rebuild the kernel, and then your example schedulers. + +The schedulers are also cleaned if you invoke `make mrproper` in the root +directory of the tree. + +### Stale kernel build / incomplete vmlinux.h file + +As described above, you'll need a `vmlinux.h` file that was generated from a +vmlinux built with BTF, and with sched_ext support enabled. If you don't, +you'll see errors such as the following which indicate that a type being +referenced in a scheduler is unknown: + +``` +/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' + +const struct scx_exit_info *ei) + +^ +``` + +In order to resolve this, please follow the steps above in +[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your +schedulers are using a vmlinux.h file that includes the requisite types. + +## Misc + +### llvm: [OFF] + +You may see the following output when building the schedulers: + +``` +Auto-detecting system features: +... clang-bpf-co-re: [ on ] +... llvm: [ OFF ] +... libcap: [ on ] +... libbfd: [ on ] +``` + +Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h new file mode 100644 index 0000000000000..ad7d139ce907b --- /dev/null +++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h @@ -0,0 +1,11 @@ +/* + * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when + * compiling BPF files although its content doesn't play any role. The file in + * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is + * defined. When compiling a BPF source, __x86_64__ isn't set and thus + * stubs-32.h is selected. However, the file is not there if the system doesn't + * have 32bit glibc devel package installed leading to a build failure. 
+ * + * The problem is worked around by making this file available in the include + * search paths before the system one when building BPF. + */ diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h new file mode 100644 index 0000000000000..f2336d357106e --- /dev/null +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -0,0 +1,245 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_BPF_H +#define __SCHED_EXT_COMMON_BPF_H + +#include "vmlinux.h" +#include +#include +#include +#include "user_exit_info.h" + +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_error_format_checker(const char *fmt, ...) {} + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Note that __param[] must have at least one + * element to keep the verifier happy. + */ +#define scx_bpf_error(fmt, args...) 
\ +({ \ + static char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + \ + ___scx_bpf_error_format_checker(fmt, ##args); \ +}) + +void scx_bpf_switch_all(void) __ksym; +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) 
\ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * RESIZABLE_ARRAY - Generates annotations for an array that may be resized + * @elfsec: the data section of the BPF program in which to place the array + * @arr: the name of the array + * + * libbpf has an API for setting map value sizes. Since data sections (i.e. + * bss, data, rodata) themselves are maps, a data section can be resized. If + * a data section has an array as its last element, the BTF info for that + * array will be adjusted so that length of the array is extended to meet the + * new length of the data section. This macro annotates an array to have an + * element count of one with the assumption that this array can be resized + * within the userspace program. It also annotates the section specifier so + * this array exists in a custom sub data section which can be resized + * independently. + * + * See RESIZE_ARRAY() for the userspace convenience macro for resizing an + * array declared with RESIZABLE_ARRAY(). + */ +#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and add that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. However, + * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller + * must check for %NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for %NULL and dereference + * immediately. 
+ * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + * + * sizeof(@base) should encompass the memory area to be accessed and thus can't + * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of + * `MEMBER_VPTR(ptr, ->member)`. + */ +#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ + u64 __base = (u64)&(base); \ + u64 __addr = (u64)&((base) member) - __base; \ + _Static_assert(sizeof(base) >= sizeof((base) member), \ + "@base is smaller than @member, is @base a pointer?"); \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof((base) member))); \ + __addr; \ +}) + +/** + * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element + * @arr: array to index into + * @i: array index + * @n: number of elements in array + * + * Similar to MEMBER_VPTR() but is intended for use with arrays where the + * element count needs to be explicit. + * It can be used in cases where a global array is defined with an initial + * size but is intended to be be resized before loading the BPF program. + * Without this version of the macro, MEMBER_VPTR() will use the compile time + * size of the array to compute the max, which will result in rejection by + * the verifier. + */ +#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \ + u64 __base = (u64)arr; \ + u64 __addr = (u64)&(arr[i]) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ + __addr; \ +}) + +/* + * BPF core and other generic helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data." 
#name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool 
bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, + const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + +#endif /* __SCHED_EXT_COMMON_BPF_H */ diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h new file mode 100644 index 0000000000000..7019d9f2da603 --- /dev/null +++ b/tools/sched_ext/include/scx/common.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ +#ifndef __SCHED_EXT_COMMON_H +#define __SCHED_EXT_COMMON_H + +#ifdef __KERNEL__ +#error "Should not be included by BPF programs" +#endif + +#include +#include +#include +#include + +#include "user_exit_info.h" + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#define SCX_BUG(__fmt, ...) \ + do { \ + fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ + strerror(errno)); \ + fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + \ + exit(EXIT_FAILURE); \ + } while (0) + +#define SCX_BUG_ON(__cond, __fmt, ...) \ + do { \ + if (__cond) \ + SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ + } while (0) + +/** + * RESIZE_ARRAY - Convenience macro for resizing a BPF array + * @elfsec: the data section of the BPF program in which the array exists + * @arr: the name of the array + * @n: the desired array element count + * + * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two + * operations. It resizes the map which corresponds to the custom data + * section that contains the target array. As a side effect, the BTF info for + * the array is adjusted so that the array length is sized to cover the new + * data section size. The second operation is reassigning the skeleton pointer + * for that custom data section so that it points to the newly memory mapped + * region.
+ */ +#define RESIZE_ARRAY(elfsec, arr, n) \ + do { \ + size_t __sz; \ + bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ + sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ + skel->elfsec##_##arr = \ + bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ + } while (0) + +#endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/ravg.bpf.h b/tools/sched_ext/include/scx/ravg.bpf.h new file mode 100644 index 0000000000000..a233d85d05aa6 --- /dev/null +++ b/tools/sched_ext/include/scx/ravg.bpf.h @@ -0,0 +1,42 @@ +#ifndef __SCX_RAVG_BPF_H__ +#define __SCX_RAVG_BPF_H__ + +/* + * Running average helpers to be used in BPF progs. Assumes vmlinux.h has + * already been included. + */ +enum ravg_consts { + RAVG_VAL_BITS = 44, /* input values are 44bit */ + RAVG_FRAC_BITS = 20, /* 1048576 is 1.0 */ +}; + +/* + * Running avg mechanism. Accumulates values between 0 and RAVG_MAX_VAL in + * arbitrary time intervals. The accumulated values are halved every half_life + * with each period starting when the current time % half_life is 0. Zeroing is + * enough for initialization. + * + * See ravg_accumulate() and ravg_read() for more details. + */ +struct ravg_data { + /* current value */ + u64 val; + + /* + * The timestamp of @val. The latest completed seq #: + * + * (val_at / half_life) - 1 + */ + u64 val_at; + + /* running avg as of the latest completed seq */ + u64 old; + + /* + * Accumulated value of the current period. Input value is 44bits and we + * normalize durations to 20bit, so it should fit in an u64.
+ */ + u64 cur; +}; + +#endif /* __SCX_RAVG_BPF_H__ */ diff --git a/tools/sched_ext/include/scx/ravg_impl.bpf.h b/tools/sched_ext/include/scx/ravg_impl.bpf.h new file mode 100644 index 0000000000000..4922a3e689bc8 --- /dev/null +++ b/tools/sched_ext/include/scx/ravg_impl.bpf.h @@ -0,0 +1,358 @@ +/* to be included in the main bpf.c file */ +#include "ravg.bpf.h" + +#define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) + +static RAVG_FN_ATTRS void ravg_add(u64 *sum, u64 addend) +{ + u64 new = *sum + addend; + + if (new >= *sum) + *sum = new; + else + *sum = -1; +} + +static RAVG_FN_ATTRS u64 ravg_decay(u64 v, u32 shift) +{ + if (shift >= 64) + return 0; + else + return v >> shift; +} + +static RAVG_FN_ATTRS u32 ravg_normalize_dur(u32 dur, u32 half_life) +{ + if (dur < half_life) + return (((u64)dur << RAVG_FRAC_BITS) + half_life - 1) / + half_life; + else + return 1 << RAVG_FRAC_BITS; +} + +/* + * Pre-computed decayed full-period values. This is quicker and keeps the bpf + * verifier happy by removing the need for looping. + * + * [0] = ravg_decay(1 << RAVG_FRAC_BITS, 1) + * [1] = [0] + ravg_decay(1 << RAVG_FRAC_BITS, 2) + * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) + * ... + */ +static u64 ravg_full_sum[] = { + 524288, 786432, 917504, 983040, + 1015808, 1032192, 1040384, 1044480, + 1046528, 1047552, 1048064, 1048320, + 1048448, 1048512, 1048544, 1048560, + 1048568, 1048572, 1048574, 1048575, + /* the same from here on */ +}; + +static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_sum[0]); + +/** + * ravg_accumulate - Accumulate a new value + * @rd: ravg_data to accumulate into + * @new_val: new value + * @now: current timestamp + * @half_life: decay period, must be the same across calls + * + * The current value is changing to @new_val at @now. Accumulate accordingly.
+ */ +static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, u64 new_val, u64 now, + u32 half_life) +{ + u32 cur_seq, val_seq, seq_delta; + + /* + * It may be difficult for the caller to guarantee monotonic progress if + * multiple CPUs accumulate to the same ravg_data. Handle @now being in + * the past of @rd->val_at. + */ + if (now < rd->val_at) + now = rd->val_at; + + cur_seq = now / half_life; + val_seq = rd->val_at / half_life; + seq_delta = cur_seq - val_seq; + + /* + * Decay ->old and fold ->cur into it. + * + * @end + * v + * timeline |---------|---------|---------|---------|---------| + * seq delta 4 3 2 1 0 + * seq ->seq cur_seq + * val ->old ->cur ^ + * | | | + * \---------+------------------/ + */ + if (seq_delta > 0) { + /* decay ->old to bring it up to the cur_seq - 1 */ + rd->old = ravg_decay(rd->old, seq_delta); + /* non-zero ->cur must be from val_seq, calc and fold */ + ravg_add(&rd->old, ravg_decay(rd->cur, seq_delta)); + /* clear */ + rd->cur = 0; + } + + if (!rd->val) + goto out; + + /* + * Accumulate @rd->val between @rd->val_at and @now.
+ * + * @rd->val_at @now + * v v + * timeline |---------|---------|---------|---------|---------| + * seq delta [ 3 | 2 | 1 | 0 ] + */ + if (seq_delta > 0) { + u32 dur; + + /* fold the oldest period which may be partial */ + dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); + ravg_add(&rd->old, rd->val * ravg_decay(dur, seq_delta)); + + /* fold the full periods in the middle with precomputed vals */ + if (seq_delta > 1) { + u32 idx = seq_delta - 2; + + if (idx >= ravg_full_sum_len) + idx = ravg_full_sum_len - 1; + + ravg_add(&rd->old, rd->val * ravg_full_sum[idx]); + } + + /* accumulate the current period duration into ->cur */ + rd->cur += rd->val * ravg_normalize_dur(now % half_life, + half_life); + } else { + rd->cur += rd->val * ravg_normalize_dur(now - rd->val_at, + half_life); + } +out: + if (new_val >= 1LLU << RAVG_VAL_BITS) + rd->val = (1LLU << RAVG_VAL_BITS) - 1; + else + rd->val = new_val; + rd->val_at = now; +} + +/** + * ravg_transfer - Transfer in or out a component running avg + * @base: ravg_data to transfer @xfer into or out of + * @base_new_val: new value for @base + * @xfer: ravg_data to transfer + * @xfer_new_val: new value for @xfer + * @is_xfer_in: transfer direction + * + * An ravg may be a sum of component ravgs. For example, a scheduling domain's + * load is the sum of the load values of all member tasks. If a task is migrated + * to a different domain, its contribution should be subtracted from the source + * ravg and added to the destination one. + * + * This function can be used for such component transfers. Both @base and @xfer + * must have been accumulated at the same timestamp. @xfer's contribution is + * subtracted if @is_xfer_in is %false and added if %true.
+ */ +static RAVG_FN_ATTRS void ravg_transfer(struct ravg_data *base, u64 base_new_val, + struct ravg_data *xfer, u64 xfer_new_val, + u32 half_life, bool is_xfer_in) +{ + /* synchronize @base and @xfer */ + if ((s64)(base->val_at - xfer->val_at) < 0) + ravg_accumulate(base, base_new_val, xfer->val_at, half_life); + else if ((s64)(base->val_at - xfer->val_at) > 0) + ravg_accumulate(xfer, xfer_new_val, base->val_at, half_life); + + /* transfer */ + if (is_xfer_in) { + base->old += xfer->old; + base->cur += xfer->cur; + } else { + if (base->old > xfer->old) + base->old -= xfer->old; + else + base->old = 0; + + if (base->cur > xfer->cur) + base->cur -= xfer->cur; + else + base->cur = 0; + } +} + +/** + * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) + * @a: multiplicand + * @b: multiplier + * @rshift: number of bits to shift right + * + * Poor man's 128bit arithmetic. Calculate ((@a * @b) >> @rshift) where @a is + * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must + * ensure that the final shifted result fits in u64. + */ +static inline __attribute__((always_inline)) +u64 u64_x_u32_rshift(u64 a, u32 b, u32 rshift) +{ + const u64 mask32 = (u32)-1; + u64 al = a & mask32; + u64 ah = (a & (mask32 << 32)) >> 32; + + /* + * ah: high 32 al: low 32 + * a |--------------||--------------| + * + * ah * b |--------------||--------------| + * al * b |--------------||--------------| + */ + al *= b; + ah *= b; + + /* + * (ah * b) >> rshift |--------------||--------------| + * (al * b) >> rshift |--------------||--------| + * <--------> + * 32 - rshift + */ + al >>= rshift; + if (rshift <= 32) + ah <<= 32 - rshift; + else + ah >>= rshift - 32; + + return al + ah; +} + +/** + * ravg_scale - Scale a running avg + * @rd: ravg_data to scale + * @mult: multiplier + * @rshift: right shift amount + * + * Scale @rd by multiplying the tracked values by @mult and shifting right by + * @rshift.
+ */ +static RAVG_FN_ATTRS void ravg_scale(struct ravg_data *rd, u32 mult, u32 rshift) +{ + rd->val = u64_x_u32_rshift(rd->val, mult, rshift); + rd->old = u64_x_u32_rshift(rd->old, mult, rshift); + rd->cur = u64_x_u32_rshift(rd->cur, mult, rshift); +} + +/** + * ravg_read - Read the current running avg + * @rd: ravg_data to read from + * @now: timestamp as of which to read the running avg + * @half_life: decay period, must match ravg_accumulate()'s + * + * Read running avg from @rd as of @now. + */ +static RAVG_FN_ATTRS u64 ravg_read(struct ravg_data *rd, u64 now, u64 half_life) +{ + struct ravg_data trd; + u32 elapsed; + + /* + * It may be difficult for the caller to guarantee monotonic progress if + * multiple CPUs accumulate to the same ravg_data. Handle @now being in + * the past of @rd->val_at. + */ + if (now < rd->val_at) + now = rd->val_at; + + elapsed = now % half_life; + + /* + * Accumulate the ongoing period into a temporary copy. This allows + * external readers to access up-to-date avg without strongly + * synchronizing with the updater (we need to add a seq lock tho). + */ + trd = *rd; + rd = &trd; + ravg_accumulate(rd, 0, now, half_life); + + /* + * At the beginning of a new half_life period, the running avg is the + * same as @rd->old. At the beginning of the next, it'd be old load / 2 + * + current load / 2. Inbetween, we blend the two linearly. + */ + if (elapsed) { + u32 progress = ravg_normalize_dur(elapsed, half_life); + /* + * `H` is the duration of the half-life window, and `E` is how + * much time has elapsed in this window. 
`P` is [0.0, 1.0] + * representing how much the current window has progressed: + * + * P = E / H + * + * If `old` is @rd->old, we would want to calculate the + * following for blending: + * + * old * (1.0 - P / 2) + * + * Because @progress is [0, 1 << RAVG_FRAC_BITS], let's multiply + * and then divide by 1 << RAVG_FRAC_BITS: + * + * (1 << RAVG_FRAC_BITS) - (1 << RAVG_FRAC_BITS) * P / 2 + * old * ----------------------------------------------------- + * 1 << RAVG_FRAC_BITS + * + * As @progress is (1 << RAVG_FRAC_BITS) * P: + * + * (1 << RAVG_FRAC_BITS) - progress / 2 + * old * ------------------------------------ + * 1 << RAVG_FRAC_BITS + * + * As @rd->old uses full 64bit, the multiplication can overflow, + * but we also know that the final result is gonna be smaller + * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle + * the interim multiplication correctly. + */ + u64 old = u64_x_u32_rshift(rd->old, + (1 << RAVG_FRAC_BITS) - progress / 2, + RAVG_FRAC_BITS); + /* + * If `S` is the Sum(val * duration) for this half-life window, + * the avg for this window is: + * + * S / E + * + * We would want to calculate the following for blending: + * + * S / E * (P / 2) + * + * As P = E / H, + * + * S / E * (E / H / 2) + * S / H / 2 + * + * Expanding S, the above becomes: + * + * Sum(val * duration) / H / 2 + * Sum(val * (duration / H)) / 2 + * + * As we use RAVG_FRAC_BITS bits for fixed point arithmetic, + * let's multiply the whole result accordingly: + * + * (Sum(val * (duration / H)) / 2) * (1 << RAVG_FRAC_BITS) + * + * duration * (1 << RAVG_FRAC_BITS) + * Sum(val * --------------------------------) / 2 + * H + * + * The righthand multiplier inside Sum() is the normalized + * duration returned from ravg_normalize_dur(), so, the whole + * Sum term equals @rd->cur. 
+ * + * rd->cur / 2 + */ + u64 cur = rd->cur / 2; + + return old + cur; + } else { + return rd->old; + } +} diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h new file mode 100644 index 0000000000000..f0e45bf3c7661 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#ifndef __USER_EXIT_INFO_H +#define __USER_EXIT_INFO_H + +struct user_exit_info { + int kind; + char reason[128]; + char msg[1024]; +}; + +#ifdef __bpf__ + +#include "vmlinux.h" +#include + +static inline void uei_record(struct user_exit_info *uei, + const struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); + bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); + /* use __sync to force memory barrier */ + __sync_val_compare_and_swap(&uei->kind, uei->kind, ei->kind); +} + +#else /* !__bpf__ */ + +static inline bool uei_exited(struct user_exit_info *uei) +{ + /* use __sync to force memory barrier */ + return __sync_val_compare_and_swap(&uei->kind, -1, -1); +} + +static inline void uei_print(const struct user_exit_info *uei) +{ + fprintf(stderr, "EXIT: %s", uei->reason); + if (uei->msg[0] != '\0') + fprintf(stderr, " (%s)", uei->msg); + fputs("\n", stderr); +} + +#endif /* __bpf__ */ +#endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c new file mode 100644 index 0000000000000..51ddb0a14bc61 --- /dev/null +++ b/tools/sched_ext/scx_central.bpf.c @@ -0,0 +1,367 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A central FIFO sched_ext scheduler which demonstrates the followings: + * + * a. 
Making all scheduling decisions from one CPU: + * + * The central CPU is the only one making scheduling decisions. All other + * CPUs kick the central CPU when they run out of tasks to run. + * + * There is one global BPF queue and the central CPU schedules all CPUs by + * dispatching from the global queue to each CPU's local dsq from dispatch(). + * This isn't the most straightforward. e.g. It'd be easier to bounce + * through per-CPU BPF queues. The current design is chosen to maximally + * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. + * + * b. Tickless operation + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. Unfortunately, BPF timer currently doesn't + * have a way to pin to a specific CPU, so the periodic timer isn't pinned to + * the central CPU. + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_ENQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. the periodic timer may run on a ksoftirqd and if the + * ksoftirqd gets starved by a user thread, there may not be anything else to + * vacate that user thread. + * + * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the + * next tasks. + * + * This scheduler is designed to maximize usage of various SCX mechanisms. A + * more practical implementation would likely put the scheduling loop outside + * the central CPU's dispatch() path and add some form of priority mechanism. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include + +char _license[] SEC("license") = "GPL"; + +enum { + FALLBACK_DSQ_ID = 0, + MS_TO_NS = 1000LLU * 1000, + TIMER_INTERVAL_NS = 1 * MS_TO_NS, +}; + +const volatile bool switch_partial; +const volatile s32 central_cpu; +const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ +const volatile u64 slice_ns = SCX_SLICE_DFL; + +bool timer_pinned = true; +u64 nr_total, nr_locals, nr_queued, nr_lost_pids; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; +u64 nr_overflows; + +struct user_exit_info uei; + +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 4096); + __type(value, s32); +} central_q SEC(".maps"); + +/* can't use percpu map due to bad lookups */ +bool RESIZABLE_ARRAY(data, cpu_gimme_task); +u64 RESIZABLE_ARRAY(data, cpu_started_at); + +struct central_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct central_timer); +} central_timer SEC(".maps"); + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* + * Steer wakeups to the central CPU as much as possible to avoid + * disturbing other CPUs. It's safe to blindly return the central cpu as + * select_cpu() is a hint and if @p can't be on it, the kernel will + * automatically pick a fallback CPU. + */ + return central_cpu; +} + +void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) +{ + s32 pid = p->pid; + + __sync_fetch_and_add(&nr_total, 1); + + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads which is necessary for forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd. 
+ */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + + if (bpf_map_push_elem(&central_q, &pid, 0)) { + __sync_fetch_and_add(&nr_overflows, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); + return; + } + + __sync_fetch_and_add(&nr_queued, 1); + + if (!scx_bpf_task_running(p)) + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); +} + +static bool dispatch_to_cpu(s32 cpu) +{ + struct task_struct *p; + s32 pid; + + bpf_repeat(BPF_MAX_LOOPS) { + if (bpf_map_pop_elem(&central_q, &pid)) + break; + + __sync_fetch_and_sub(&nr_queued, 1); + + p = bpf_task_from_pid(pid); + if (!p) { + __sync_fetch_and_add(&nr_lost_pids, 1); + continue; + } + + /* + * If we can't run the task at the top, do the dumb thing and + * bounce it to the fallback dsq. + */ + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + __sync_fetch_and_add(&nr_mismatches, 1); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); + bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ, without dispatching to the local DSQ of the + * target CPU. In such a case, break the loop now as the next + * dispatch operation would fail. 
+ */ + if (!scx_bpf_dispatch_nr_slots()) + break; + continue; + } + + /* dispatch to local and mark that @cpu doesn't need more */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); + + if (cpu != central_cpu) + scx_bpf_kick_cpu(cpu, 0); + + bpf_task_release(p); + return true; + } + + return false; +} + +void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) +{ + if (cpu == central_cpu) { + /* dispatch for all other CPUs first */ + __sync_fetch_and_add(&nr_dispatches, 1); + + bpf_for(cpu, 0, nr_cpu_ids) { + bool *gimme; + + if (!scx_bpf_dispatch_nr_slots()) + break; + + /* + * central's gimme is never set. Also skip when the slot + * lookup fails so that the store below can't hit a NULL + * pointer. + */ + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (!gimme || !*gimme) + continue; + + if (dispatch_to_cpu(cpu)) + *gimme = false; + } + + /* + * Retry if we ran out of dispatch buffer slots as we might have + * skipped some CPUs and also need to dispatch for self. The ext + * core automatically retries if the local dsq is empty but we + * can't rely on that as we're dispatching for other CPUs too. + * Kick self explicitly to retry. + */ + if (!scx_bpf_dispatch_nr_slots()) { + __sync_fetch_and_add(&nr_retries, 1); + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + return; + } + + /* look for a task to run on the central CPU */ + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + dispatch_to_cpu(central_cpu); + } else { + bool *gimme; + + if (scx_bpf_consume(FALLBACK_DSQ_ID)) + return; + + gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); + if (gimme) + *gimme = true; + + /* + * Force dispatch on the scheduling CPU so that it finds a task + * to run for us. 
+ */ + scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); + } +} + +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at) + *started_at = 0; +} + +static int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 now = bpf_ktime_get_ns(); + u64 nr_to_kick = nr_queued; + s32 i, curr_cpu; + + curr_cpu = bpf_get_smp_processor_id(); + if (timer_pinned && (curr_cpu != central_cpu)) { + scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", + curr_cpu, central_cpu); + return 0; + } + + bpf_for(i, 0, nr_cpu_ids) { + s32 cpu = (nr_timers + i) % nr_cpu_ids; + u64 *started_at; + + if (cpu == central_cpu) + continue; + + /* kick iff the current one exhausted its slice */ + started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); + if (started_at && *started_at && + vtime_before(now, *started_at + slice_ns)) + continue; + + /* and there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (nr_to_kick) + nr_to_kick--; + else + continue; + + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + } + + /* + * Re-arm with the flags central_init() settled on. Kernels lacking + * BPF_F_TIMER_CPU_PIN reject it with -EINVAL and the timer would stop. + */ + bpf_timer_start(timer, TIMER_INTERVAL_NS, + timer_pinned ? BPF_F_TIMER_CPU_PIN : 0); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + +int BPF_STRUCT_OPS_SLEEPABLE(central_init) +{ + u32 key = 0; + struct bpf_timer *timer; + int ret; + + if (!switch_partial) + scx_bpf_switch_all(); + + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(&central_timer, &key); + if (!timer) + return -ESRCH; + + if (bpf_get_smp_processor_id() != central_cpu) { + 
scx_bpf_error("init from non-central CPU"); + return -EINVAL; + } + + bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); + /* + * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a + * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. + * Retry without the PIN. This would be the perfect use case for + * bpf_core_enum_value_exists() but the enum type doesn't have a name + * and can't be used with bpf_core_enum_value_exists(). Oh well... + */ + if (ret == -EINVAL) { + timer_pinned = false; + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + } + if (ret) + scx_bpf_error("bpf_timer_start failed (%d)", ret); + return ret; +} + +void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops central_ops = { + /* + * We are offloading all scheduling decisions to the central CPU and + * thus being the last task on a given CPU doesn't mean anything + * special. Enqueue the last tasks like any other tasks. + */ + .flags = SCX_OPS_ENQ_LAST, + + .select_cpu = (void *)central_select_cpu, + .enqueue = (void *)central_enqueue, + .dispatch = (void *)central_dispatch, + .running = (void *)central_running, + .stopping = (void *)central_stopping, + .init = (void *)central_init, + .exit = (void *)central_exit, + .name = "central", +}; diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c new file mode 100644 index 0000000000000..501505001bf98 --- /dev/null +++ b/tools/sched_ext/scx_central.c @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "scx_central.bpf.skel.h" + +const char help_fmt[] = +"A central FIFO sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-s SLICE_US] [-c CPU] [-p]\n" +"\n" +" -s SLICE_US Override slice duration\n" +" -c CPU Override the central CPU (default: 0)\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int dummy) +{ + exit_req = 1; +} + +int main(int argc, char **argv) +{ + struct scx_central *skel; + struct bpf_link *link; + __u64 seq = 0; + __s32 opt; + cpu_set_t *cpuset; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_central__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + skel->rodata->central_cpu = 0; + skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); + + while ((opt = getopt(argc, argv, "s:c:ph")) != -1) { + switch (opt) { + case 's': + skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; + break; + case 'c': + skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + break; + case 'p': + skel->rodata->switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + /* Resize arrays so their element count is equal to cpu count. */ + RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids); + RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids); + + SCX_BUG_ON(scx_central__load(skel), "Failed to load skel"); + + /* + * Affinitize the loading thread to the central CPU, as: + * - That's where the BPF timer is first invoked in the BPF program. 
+ * - We probably don't want this user space component to take up a core + * from a task that would benefit from avoiding preemption on one of + * the tickless cores. + * + * Until BPF supports pinning the timer, it's not guaranteed that it + * will always be invoked on the central CPU. In practice, this + * suffices the majority of the time. + */ + cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); + SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); + /* + * @cpuset is sized by CPU_ALLOC(), so use the _S variants with its real + * byte size. sizeof(cpuset) would only be the size of the pointer. + */ + CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); + CPU_SET_S(skel->rodata->central_cpu, + CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); + SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), + cpuset), + "Failed to affinitize to central CPU %d (max %d)", + skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); + CPU_FREE(cpuset); + + link = bpf_map__attach_struct_ops(skel->maps.central_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + if (!skel->data->timer_pinned) + printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + printf("[SEQ %llu]\n", seq++); + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", + skel->bss->nr_total, + skel->bss->nr_locals, + skel->bss->nr_queued, + skel->bss->nr_lost_pids); + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", + skel->bss->nr_timers, + skel->bss->nr_dispatches, + skel->bss->nr_mismatches, + skel->bss->nr_retries); + printf("overflow:%10" PRIu64 "\n", + skel->bss->nr_overflows); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + uei_print(&skel->bss->uei); + scx_central__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c new file mode 100644 index 0000000000000..d6a947bc98151 --- /dev/null +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -0,0 +1,960 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext flattened cgroup hierarchy scheduler. 
It implements + * hierarchical weight-based cgroup CPU control by flattening the cgroup + * hierarchy into a single layer by compounding the active weight share at each + * level. Consider the following hierarchy with weights in parentheses: + * + * R + A (100) + B (100) + * | \ C (100) + * \ D (200) + * + * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. + * Let's say all three have runnable tasks. The total share that each of these + * three cgroups is entitled to can be calculated by compounding its share at + * each level. + * + * For example, B is competing against C and in that competition its share is + * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's + * share in that competition is 100/(100+200) == 1/3. B's eventual share in the + * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's + * eventual share is the same at 1/6. D is only competing at the top level and + * its share is 200/(100+200) == 2/3. + * + * So, instead of hierarchically scheduling level-by-level, we can consider it + * as B, C and D competing against each other with respective shares of 1/6, + * 1/6 and 2/3 and keep updating the eventual shares as the cgroups' runnable + * states change. + * + * This flattening of hierarchy can bring a substantial performance gain when + * the cgroup hierarchy is nested multiple levels. In a simple benchmark using + * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it + * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two + * apache instances competing with 2:1 weight ratio nested four levels deep. + * + * However, the gain comes at the cost of not being able to properly handle + * thundering herd of cgroups. For example, if many cgroups which are nested + * behind a low priority parent cgroup wake up around the same time, they may be + * able to consume more CPU cycles than they are entitled to. 
In many use cases, + * this isn't a real concern especially given the performance gain. Also, there + * are ways to mitigate the problem further by e.g. introducing an extra + * scheduling layer on cgroup delegation boundaries. + * + * The scheduler first picks the cgroup to run and then schedule the tasks + * within by using nested weighted vtime scheduling by default. The + * cgroup-internal scheduling can be switched to FIFO with the -f option. + */ +#include +#include "scx_flatcg.h" + +/* + * Maximum amount of retries to find a valid cgroup. + */ +#define CGROUP_MAX_RETRIES 1024 + +char _license[] SEC("license") = "GPL"; + +const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ +const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; +const volatile bool fifo_sched; +const volatile bool switch_partial; + +u64 cvtime_now; +struct user_exit_info uei; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, u64); + __uint(max_entries, FCG_NR_STATS); +} stats SEC(".maps"); + +static void stat_inc(enum fcg_stat_idx idx) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p)++; +} + +struct fcg_cpu_ctx { + u64 cur_cgid; + u64 cur_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct fcg_cpu_ctx); + __uint(max_entries, 1); +} cpu_ctx SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_cgrp_ctx); +} cgrp_ctx SEC(".maps"); + +struct cgv_node { + struct bpf_rb_node rb_node; + __u64 cvtime; + __u64 cgid; +}; + +private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; +private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); + +struct cgv_node_stash { + struct cgv_node __kptr *node; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 16384); + __type(key, __u64); + __type(value, struct 
cgv_node_stash); +} cgv_node_stash SEC(".maps"); + +struct fcg_task_ctx { + u64 bypassed_at; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct fcg_task_ctx); +} task_ctx SEC(".maps"); + +/* gets inc'd on weight tree changes to expire the cached hweights */ +u64 hweight_gen = 1; + +static u64 div_round_up(u64 dividend, u64 divisor) +{ + return (dividend + divisor - 1) / divisor; +} + +static bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct cgv_node *cgc_a, *cgc_b; + + cgc_a = container_of(a, struct cgv_node, rb_node); + cgc_b = container_of(b, struct cgv_node, rb_node); + + return cgc_a->cvtime < cgc_b->cvtime; +} + +static struct fcg_cpu_ctx *find_cpu_ctx(void) +{ + struct fcg_cpu_ctx *cpuc; + u32 idx = 0; + + cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); + if (!cpuc) { + scx_bpf_error("cpu_ctx lookup failed"); + return NULL; + } + return cpuc; +} + +static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) +{ + struct fcg_cgrp_ctx *cgc; + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); + return NULL; + } + return cgc; +} + +static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) +{ + struct fcg_cgrp_ctx *cgc; + + cgrp = bpf_cgroup_ancestor(cgrp, level); + if (!cgrp) { + scx_bpf_error("ancestor cgroup lookup failed"); + return NULL; + } + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + scx_bpf_error("ancestor cgrp_ctx lookup failed"); + bpf_cgroup_release(cgrp); + return cgc; +} + +static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + int level; + + if (!cgc->nr_active) { + stat_inc(FCG_STAT_HWT_SKIP); + return; + } + + if (cgc->hweight_gen == hweight_gen) { + stat_inc(FCG_STAT_HWT_CACHE); + return; + } + + 
stat_inc(FCG_STAT_HWT_UPDATES); + bpf_for(level, 0, cgrp->level + 1) { + struct fcg_cgrp_ctx *cgc; + bool is_active; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + + if (!level) { + cgc->hweight = FCG_HWEIGHT_ONE; + cgc->hweight_gen = hweight_gen; + } else { + struct fcg_cgrp_ctx *pcgc; + + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + + /* + * We can be opportunistic here and not grab the + * cgv_tree_lock and deal with the occasional races. + * However, hweight updates are already cached and + * relatively low-frequency. Let's just do the + * straightforward thing. + */ + bpf_spin_lock(&cgv_tree_lock); + is_active = cgc->nr_active; + if (is_active) { + cgc->hweight_gen = pcgc->hweight_gen; + cgc->hweight = + div_round_up(pcgc->hweight * cgc->weight, + pcgc->child_weight_sum); + } + bpf_spin_unlock(&cgv_tree_lock); + + if (!is_active) { + stat_inc(FCG_STAT_HWT_RACE); + break; + } + } + } +} + +static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) +{ + u64 delta, cvtime, max_budget; + + /* + * A node which is on the rbtree can't be pointed to from elsewhere yet + * and thus can't be updated and repositioned. Instead, we collect the + * vtime deltas separately and apply them asynchronously here. + */ + delta = cgc->cvtime_delta; + __sync_fetch_and_sub(&cgc->cvtime_delta, delta); + cvtime = cgv_node->cvtime + delta; + + /* + * Allow a cgroup to carry the maximum budget proportional to its + * hweight such that a full-hweight cgroup can immediately take up half + * of the CPUs at the most while staying at the front of the rbtree. 
+ */ + max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / + (2 * FCG_HWEIGHT_ONE); + if (vtime_before(cvtime, cvtime_now - max_budget)) + cvtime = cvtime_now - max_budget; + + cgv_node->cvtime = cvtime; +} + +static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) +{ + struct cgv_node_stash *stash; + struct cgv_node *cgv_node; + u64 cgid = cgrp->kn->id; + + /* paired with cmpxchg in try_pick_next_cgroup() */ + if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { + stat_inc(FCG_STAT_ENQ_SKIP); + return; + } + + stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); + if (!stash) { + scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); + return; + } + + /* NULL if the node is already on the rbtree */ + cgv_node = bpf_kptr_xchg(&stash->node, NULL); + if (!cgv_node) { + stat_inc(FCG_STAT_ENQ_RACE); + return; + } + + bpf_spin_lock(&cgv_tree_lock); + cgrp_cap_budget(cgv_node, cgc); + bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); + bpf_spin_unlock(&cgv_tree_lock); +} + +static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) +{ + /* + * Tell fcg_stopping() that this bypassed the regular scheduling path + * and should be force charged to the cgroup. 0 is used to indicate that + * the task isn't bypassing, so if the current runtime is 0, go back by + * one nanosecond. + */ + taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; +} + +s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + struct fcg_task_ctx *taskc; + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return cpu; + } + + /* + * If select_cpu_dfl() is recommending local enqueue, the target CPU is + * idle. Follow it and charge the cgroup later in fcg_stopping() after + * the fact. 
+ */ + if (is_idle) { + set_bypassed_at(p, taskc); + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + /* + * Use the direct dispatching and force charging to deal with tasks with + * custom affinities so that we don't have to worry about per-cgroup + * dq's containing tasks that can't be executed from some CPUs. + */ + if (p->nr_cpus_allowed != nr_cpus) { + set_bypassed_at(p, taskc); + + /* + * The global dq is deprioritized as we don't want to let tasks + * to boost themselves by constraining its cpumask. The + * deprioritization is rather severe, so let's not apply that to + * per-cpu kernel threads. This is ham-fisted. We probably wanna + * implement per-cgroup fallback dq's instead so that we have + * more control over when tasks with custom cpumask get issued. + */ + if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { + stat_inc(FCG_STAT_LOCAL); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + } else { + stat_inc(FCG_STAT_GLOBAL); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } + return; + } + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + goto out_release; + + if (fifo_sched) { + scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + } else { + u64 tvtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. 
+ */ + if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) + tvtime = cgc->tvtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, + tvtime, enq_flags); + } + + cgrp_enqueued(cgrp, cgc); +out_release: + bpf_cgroup_release(cgrp); +} + +/* + * Walk the cgroup tree to update the active weight sums as tasks wake up and + * sleep. The weight sums are used as the base when calculating the proportion a + * given cgroup or task is entitled to at each level. + */ +static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) +{ + struct fcg_cgrp_ctx *cgc; + bool updated = false; + int idx; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + /* + * In most cases, a hot cgroup would have multiple threads going to + * sleep and waking up while the whole cgroup stays active. In leaf + * cgroups, ->nr_runnable which is updated with __sync operations gates + * ->nr_active updates, so that we don't have to grab the cgv_tree_lock + * repeatedly for a busy cgroup which is staying active. + */ + if (runnable) { + if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_ACT); + } else { + if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) + return; + stat_inc(FCG_STAT_DEACT); + } + + /* + * If @cgrp is becoming runnable, its hweight should be refreshed after + * it's added to the weight tree so that enqueue has the up-to-date + * value. If @cgrp is becoming quiescent, the hweight should be + * refreshed before it's removed from the weight tree so that the usage + * charging which happens afterwards has access to the latest value. 
+ */ + if (!runnable) + cgrp_refresh_hweight(cgrp, cgc); + + /* propagate upwards */ + bpf_for(idx, 0, cgrp->level) { + int level = cgrp->level - idx; + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + bool propagate = false; + + cgc = find_ancestor_cgrp_ctx(cgrp, level); + if (!cgc) + break; + if (level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); + if (!pcgc) + break; + } + + /* + * We need the propagation protected by a lock to synchronize + * against weight changes. There's no reason to drop the lock at + * each level but bpf_spin_lock() doesn't want any function + * calls while locked. + */ + bpf_spin_lock(&cgv_tree_lock); + + if (runnable) { + if (!cgc->nr_active++) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum += cgc->weight; + } + } + } else { + if (!--cgc->nr_active) { + updated = true; + if (pcgc) { + propagate = true; + pcgc->child_weight_sum -= cgc->weight; + } + } + } + + bpf_spin_unlock(&cgv_tree_lock); + + if (!propagate) + break; + } + + if (updated) + __sync_fetch_and_add(&hweight_gen, 1); + + if (runnable) + cgrp_refresh_hweight(cgrp, cgc); +} + +void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, true); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) +{ + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + if (fifo_sched) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + /* + * @cgc->tvtime_now always progresses forward as tasks start + * executing. The test and update can be performed concurrently + * from multiple CPUs and thus racy. Any error should be + * contained and temporary. Let's just live with it. 
+ */ + if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) + cgc->tvtime_now = p->scx.dsq_vtime; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. + */ + if (!fifo_sched) + p->scx.dsq_vtime += + (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (!taskc->bypassed_at) + return; + + cgrp = scx_bpf_task_cgroup(p); + cgc = find_cgrp_ctx(cgrp); + if (cgc) { + __sync_fetch_and_add(&cgc->cvtime_delta, + p->se.sum_exec_runtime - taskc->bypassed_at); + taskc->bypassed_at = 0; + } + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) +{ + struct cgroup *cgrp; + + cgrp = scx_bpf_task_cgroup(p); + update_active_weight_sums(cgrp, false); + bpf_cgroup_release(cgrp); +} + +void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) +{ + struct fcg_cgrp_ctx *cgc, *pcgc = NULL; + + cgc = find_cgrp_ctx(cgrp); + if (!cgc) + return; + + if (cgrp->level) { + pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); + if (!pcgc) + return; + } + + bpf_spin_lock(&cgv_tree_lock); + if (pcgc && cgc->nr_active) + pcgc->child_weight_sum += (s64)weight - cgc->weight; + cgc->weight = weight; + bpf_spin_unlock(&cgv_tree_lock); +} + +static bool try_pick_next_cgroup(u64 *cgidp) +{ + struct bpf_rb_node *rb_node; + struct cgv_node_stash *stash; + struct 
cgv_node *cgv_node; + struct fcg_cgrp_ctx *cgc; + struct cgroup *cgrp; + u64 cgid; + + /* pop the front cgroup and wind cvtime_now accordingly */ + bpf_spin_lock(&cgv_tree_lock); + + rb_node = bpf_rbtree_first(&cgv_tree); + if (!rb_node) { + bpf_spin_unlock(&cgv_tree_lock); + stat_inc(FCG_STAT_PNC_NO_CGRP); + *cgidp = 0; + return true; + } + + rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); + bpf_spin_unlock(&cgv_tree_lock); + + if (!rb_node) { + /* + * This should never happen. bpf_rbtree_first() was called + * above while the tree lock was held, so the node should + * always be present. + */ + scx_bpf_error("node could not be removed"); + return true; + } + + cgv_node = container_of(rb_node, struct cgv_node, rb_node); + cgid = cgv_node->cgid; + + if (vtime_before(cvtime_now, cgv_node->cvtime)) + cvtime_now = cgv_node->cvtime; + + /* + * If lookup fails, the cgroup's gone. Free and move on. See + * fcg_cgroup_exit(). + */ + cgrp = bpf_cgroup_from_id(cgid); + if (!cgrp) { + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); + if (!cgc) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_GONE); + goto out_free; + } + + if (!scx_bpf_consume(cgid)) { + bpf_cgroup_release(cgrp); + stat_inc(FCG_STAT_PNC_EMPTY); + goto out_stash; + } + + /* + * Successfully consumed from the cgroup. This will be our current + * cgroup for the new slice. Refresh its hweight. + */ + cgrp_refresh_hweight(cgrp, cgc); + + bpf_cgroup_release(cgrp); + + /* + * As the cgroup may have more tasks, add it back to the rbtree. Note + * that here we charge the full slice upfront and then exact later + * according to the actual consumption. This prevents lowpri thundering + * herd from saturating the machine. 
+	 */
+	bpf_spin_lock(&cgv_tree_lock);
+	cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1);
+	cgrp_cap_budget(cgv_node, cgc);
+	bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
+	bpf_spin_unlock(&cgv_tree_lock);
+
+	*cgidp = cgid;
+	stat_inc(FCG_STAT_PNC_NEXT);
+	return true;
+
+out_stash:
+	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
+	if (!stash) {
+		stat_inc(FCG_STAT_PNC_GONE);
+		goto out_free;
+	}
+
+	/*
+	 * Paired with cmpxchg in cgrp_enqueued(). If they see the following
+	 * transition, they'll enqueue the cgroup. If they are earlier, we'll
+	 * see their task in the dq below and requeue the cgroup.
+	 */
+	__sync_val_compare_and_swap(&cgc->queued, 1, 0);
+
+	if (scx_bpf_dsq_nr_queued(cgid)) {
+		bpf_spin_lock(&cgv_tree_lock);
+		bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
+		bpf_spin_unlock(&cgv_tree_lock);
+		stat_inc(FCG_STAT_PNC_RACE);
+	} else {
+		cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
+		if (cgv_node) {
+			scx_bpf_error("unexpected !NULL cgv_node stash");
+			goto out_free;
+		}
+	}
+
+	return false;
+
+out_free:
+	bpf_obj_drop(cgv_node);
+	return false;
+}
+
+/*
+ * ops.dispatch(): find something for this CPU to run.
+ *
+ * While less than cgrp_slice_ns has passed since cpuc->cur_at, keep consuming
+ * from the DSQ of the CPU's current cgroup (cpuc->cur_cgid). Otherwise add
+ * the difference between the end of the charged slice and @now, scaled by the
+ * cgroup's hweight, to cgc->cvtime_delta, then try the global DSQ and finally
+ * call try_pick_next_cgroup() up to CGROUP_MAX_RETRIES times.
+ */
+void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
+{
+	struct fcg_cpu_ctx *cpuc;
+	struct fcg_cgrp_ctx *cgc;
+	struct cgroup *cgrp;
+	u64 now = bpf_ktime_get_ns();
+	bool picked_next = false;
+
+	cpuc = find_cpu_ctx();
+	if (!cpuc)
+		return;
+
+	/* cur_cgid == 0 means no current cgroup (see the global DSQ path below) */
+	if (!cpuc->cur_cgid)
+		goto pick_next_cgroup;
+
+	if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) {
+		if (scx_bpf_consume(cpuc->cur_cgid)) {
+			stat_inc(FCG_STAT_CNS_KEEP);
+			return;
+		}
+		stat_inc(FCG_STAT_CNS_EMPTY);
+	} else {
+		stat_inc(FCG_STAT_CNS_EXPIRE);
+	}
+
+	/*
+	 * The current cgroup is expiring. It was already charged a full slice.
+	 * Calculate the actual usage and accumulate the delta.
+	 */
+	cgrp = bpf_cgroup_from_id(cpuc->cur_cgid);
+	if (!cgrp) {
+		stat_inc(FCG_STAT_CNS_GONE);
+		goto pick_next_cgroup;
+	}
+
+	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
+	if (cgc) {
+		/*
+		 * We want to update the vtime delta and then look for the next
+		 * cgroup to execute but the latter needs to be done in a loop
+		 * and we can't keep the lock held. Oh well...
+		 */
+		bpf_spin_lock(&cgv_tree_lock);
+		__sync_fetch_and_add(&cgc->cvtime_delta,
+				     (cpuc->cur_at + cgrp_slice_ns - now) *
+				     FCG_HWEIGHT_ONE / (cgc->hweight ?: 1));
+		bpf_spin_unlock(&cgv_tree_lock);
+	} else {
+		stat_inc(FCG_STAT_CNS_GONE);
+	}
+
+	bpf_cgroup_release(cgrp);
+
+pick_next_cgroup:
+	/* a new slice starts now regardless of what gets picked below */
+	cpuc->cur_at = now;
+
+	if (scx_bpf_consume(SCX_DSQ_GLOBAL)) {
+		cpuc->cur_cgid = 0;
+		return;
+	}
+
+	bpf_repeat(CGROUP_MAX_RETRIES) {
+		if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
+			picked_next = true;
+			break;
+		}
+	}
+
+	/*
+	 * This only happens if try_pick_next_cgroup() races against enqueue
+	 * path for more than CGROUP_MAX_RETRIES times, which is extremely
+	 * unlikely and likely indicates an underlying bug. There shouldn't be
+	 * any stall risk as the race is against enqueue.
+	 */
+	if (!picked_next)
+		stat_inc(FCG_STAT_PNC_FAIL);
+}
+
+/*
+ * ops.init_task(): create @p's task_ctx, clear bypassed_at and start @p's
+ * dsq_vtime at the tvtime_now of args->cgroup.
+ *
+ * Returns 0 on success, -ENOMEM if the task storage can't be created and
+ * -ENOENT if the cgroup has no fcg_cgrp_ctx.
+ */
+s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct fcg_task_ctx *taskc;
+	struct fcg_cgrp_ctx *cgc;
+
+	/*
+	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
+	 * in this function and the following will automatically use GFP_KERNEL.
+	 */
+	taskc = bpf_task_storage_get(&task_ctx, p, 0,
+				     BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!taskc)
+		return -ENOMEM;
+
+	taskc->bypassed_at = 0;
+
+	if (!(cgc = find_cgrp_ctx(args->cgroup)))
+		return -ENOENT;
+
+	p->scx.dsq_vtime = cgc->tvtime_now;
+
+	return 0;
+}
+
+/*
+ * ops.cgroup_init(): set up per-cgroup state for @cgrp.
+ *
+ * Creates a DSQ whose ID is the cgroup ID, the cgroup-local fcg_cgrp_ctx
+ * (weight from @args, hweight FCG_HWEIGHT_ONE), a cgv_node_stash entry keyed
+ * by the cgroup ID, and a fresh cgv_node parked in that stash with its cvtime
+ * set to cvtime_now. On failure, whatever was created is torn down in reverse
+ * order through the err_* labels and a negative errno is returned.
+ */
+int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
+			     struct scx_cgroup_init_args *args)
+{
+	struct fcg_cgrp_ctx *cgc;
+	struct cgv_node *cgv_node;
+	struct cgv_node_stash empty_stash = {}, *stash;
+	u64 cgid = cgrp->kn->id;
+	int ret;
+
+	/*
+	 * Technically incorrect as cgroup ID is full 64bit while dq ID is
+	 * 63bit. Should not be a problem in practice and easy to spot in the
+	 * unlikely case that it breaks.
+	 */
+	ret = scx_bpf_create_dsq(cgid, -1);
+	if (ret)
+		return ret;
+
+	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!cgc) {
+		ret = -ENOMEM;
+		goto err_destroy_dsq;
+	}
+
+	cgc->weight = args->weight;
+	cgc->hweight = FCG_HWEIGHT_ONE;
+
+	ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash,
+				  BPF_NOEXIST);
+	if (ret) {
+		if (ret != -ENOMEM)
+			scx_bpf_error("unexpected stash creation error (%d)",
+				      ret);
+		goto err_destroy_dsq;
+	}
+
+	stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
+	if (!stash) {
+		scx_bpf_error("unexpected cgv_node stash lookup failure");
+		ret = -ENOENT;
+		goto err_destroy_dsq;
+	}
+
+	cgv_node = bpf_obj_new(struct cgv_node);
+	if (!cgv_node) {
+		ret = -ENOMEM;
+		goto err_del_cgv_node;
+	}
+
+	cgv_node->cgid = cgid;
+	cgv_node->cvtime = cvtime_now;
+
+	/* the stash was just created empty, so the exchange must return NULL */
+	cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
+	if (cgv_node) {
+		scx_bpf_error("unexpected !NULL cgv_node stash");
+		ret = -EBUSY;
+		goto err_drop;
+	}
+
+	return 0;
+
+err_drop:
+	bpf_obj_drop(cgv_node);
+err_del_cgv_node:
+	bpf_map_delete_elem(&cgv_node_stash, &cgid);
+err_destroy_dsq:
+	scx_bpf_destroy_dsq(cgid);
+	return ret;
+}
+
+/*
+ * ops.cgroup_exit(): drop @cgrp's stash entry and destroy its DSQ.
+ */
+void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp)
+{
+	u64 cgid = cgrp->kn->id;
+
+	/*
+	 * For now, there's no way to find and remove the cgv_node if it's on
+	 * the cgv_tree. Let's drain them in the dispatch path as they get
+	 * popped off the front of the tree.
+	 */
+	bpf_map_delete_elem(&cgv_node_stash, &cgid);
+	scx_bpf_destroy_dsq(cgid);
+}
+
+/*
+ * ops.cgroup_move(): @p is moving from @from to @to. Keep @p's vtime offset
+ * relative to its cgroup's tvtime_now unchanged across the move.
+ */
+void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
+		    struct cgroup *from, struct cgroup *to)
+{
+	struct fcg_cgrp_ctx *from_cgc, *to_cgc;
+	s64 vtime_delta;
+
+	/* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
+	if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
+		return;
+
+	vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now;
+	p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
+}
+
+/*
+ * ops.init(): unless switch_partial is set, switch all tasks to this
+ * scheduler.
+ */
+s32 BPF_STRUCT_OPS(fcg_init)
+{
+	if (!switch_partial)
+		scx_bpf_switch_all();
+	return 0;
+}
+
+/* ops.exit(): record the exit info in uei for the userspace side to read */
+void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
+{
+	uei_record(&uei, ei);
+}
+
+/* operation table registering the fcg_* callbacks as the "flatcg" scheduler */
+SEC(".struct_ops.link")
+struct sched_ext_ops flatcg_ops = {
+	.select_cpu		= (void *)fcg_select_cpu,
+	.enqueue		= (void *)fcg_enqueue,
+	.dispatch		= (void *)fcg_dispatch,
+	.runnable		= (void *)fcg_runnable,
+	.running		= (void *)fcg_running,
+	.stopping		= (void *)fcg_stopping,
+	.quiescent		= (void *)fcg_quiescent,
+	.init_task		= (void *)fcg_init_task,
+	.cgroup_set_weight	= (void *)fcg_cgroup_set_weight,
+	.cgroup_init		= (void *)fcg_cgroup_init,
+	.cgroup_exit		= (void *)fcg_cgroup_exit,
+	.cgroup_move		= (void *)fcg_cgroup_move,
+	.init			= (void *)fcg_init,
+	.exit			= (void *)fcg_exit,
+	.flags			= SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING,
+	.name			= "flatcg",
+};
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
new file mode 100644
index 0000000000000..6c2f9715f6925
--- /dev/null
+++ b/tools/sched_ext/scx_flatcg.c
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 Tejun Heo
+ * Copyright (c) 2023 David Vernet
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "scx_flatcg.h"
+#include "scx_flatcg.bpf.skel.h"
+
+#ifndef FILEID_KERNFS
+#define FILEID_KERNFS 0xfe
+#endif
+
+/*
+ * Usage text; the single %s is the program name.
+ * NOTE(review): getopt() in main() also accepts -d, which is not listed here.
+ */
+const char help_fmt[] =
+"A flattened cgroup hierarchy sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-p]\n"
+"\n"
+" -s SLICE_US Override slice duration\n"
+" -i INTERVAL Report interval\n"
+" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
+" -p Switch only tasks on SCHED_EXT policy instead of all\n"
+" -h Display this help and exit\n";
+
+/* set by the signal handler; polled by the reporting loop in main() */
+static volatile int exit_req;
+
+/* SIGINT/SIGTERM handler: request a clean exit from the main loop */
+static void sigint_handler(int dummy)
+{
+	exit_req = 1;
+}
+
+/*
+ * Return the fraction of non-idle CPU time since the previous call.
+ *
+ * Parses the first line of /proc/stat, sums every numeric field and treats
+ * field index 4 as idle time. @last_sum and @last_idle carry the previous
+ * totals and are updated in place. Returns 0.0 if the file can't be read or
+ * if the total did not advance.
+ */
+static float read_cpu_util(__u64 *last_sum, __u64 *last_idle)
+{
+	FILE *fp;
+	char buf[4096];
+	char *line, *cur = NULL, *tok;
+	__u64 sum = 0, idle = 0;
+	__u64 delta_sum, delta_idle;
+	int idx;
+
+	fp = fopen("/proc/stat", "r");
+	if (!fp) {
+		perror("fopen(\"/proc/stat\")");
+		return 0.0;
+	}
+
+	if (!fgets(buf, sizeof(buf), fp)) {
+		perror("fgets(\"/proc/stat\")");
+		fclose(fp);
+		return 0.0;
+	}
+	fclose(fp);
+
+	line = buf;
+	for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) {
+		char *endp = NULL;
+		__u64 v;
+
+		/* token 0 is the row label, not a number; skip it */
+		if (idx == 0) {
+			line = NULL;
+			continue;
+		}
+		v = strtoull(tok, &endp, 0);
+		if (!endp || *endp != '\0') {
+			fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n",
+				idx, tok);
+			continue;
+		}
+		sum += v;
+		if (idx == 4)
+			idle = v;
+	}
+
+	delta_sum = sum - *last_sum;
+	delta_idle = idle - *last_idle;
+	*last_sum = sum;
+	*last_idle = idle;
+
+	return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0;
+}
+
+/*
+ * Read the stats map and sum each counter over all CPUs.
+ *
+ * @stats must have room for FCG_NR_STATS entries; it is zeroed first. A
+ * counter whose map lookup fails is left at zero.
+ */
+static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats)
+{
+	__u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus];
+	__u32 idx;
+
+	memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);
+
+	for (idx = 0; idx < FCG_NR_STATS; idx++) {
+		int ret, cpu;
+
+		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+					  &idx, cnts[idx]);
+		if (ret < 0)
+			continue;
+		for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++)
+			stats[idx] += cnts[idx][cpu];
+	}
+}
+
+/*
+ * Open the skeleton, apply command line options to its read-only data, load
+ * and attach the scheduler, then print per-interval stat deltas until a
+ * signal arrives or the BPF side reports an exit.
+ */
+int main(int argc, char **argv)
+{
+	struct scx_flatcg *skel;
+	struct bpf_link *link;
+	struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 };
+	bool dump_cgrps = false;
+	__u64 last_cpu_sum = 0, last_cpu_idle = 0;
+	__u64 last_stats[FCG_NR_STATS] = {};
+	unsigned long seq = 0;
+	__s32 opt;
+
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+	skel = scx_flatcg__open();
+	SCX_BUG_ON(!skel, "Failed to open skel");
+
+	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+
+	while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) {
+		double v;
+
+		switch (opt) {
+		case 's':
+			/* microseconds on the command line, nanoseconds in BPF */
+			v = strtod(optarg, NULL);
+			skel->rodata->cgrp_slice_ns = v * 1000;
+			break;
+		case 'i':
+			v = strtod(optarg, NULL);
+			intv_ts.tv_sec = v;
+			/*
+			 * Keep the subtraction in double. A float cast of
+			 * tv_sec is inexact above 2^24 and could push tv_nsec
+			 * outside [0, 1e9).
+			 */
+			intv_ts.tv_nsec = (v - (double)intv_ts.tv_sec) * 1000000000;
+			break;
+		case 'd':
+			dump_cgrps = true;
+			break;
+		case 'f':
+			skel->rodata->fifo_sched = true;
+			break;
+		case 'p':
+			skel->rodata->switch_partial = true;
+			break;
+		case 'h':
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	/* no trailing newline: the first report below starts with one */
+	printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d",
+	       (double)skel->rodata->cgrp_slice_ns / 1000000.0,
+	       (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
+	       dump_cgrps);
+
+	SCX_BUG_ON(scx_flatcg__load(skel), "Failed to load skel");
+
+	link = bpf_map__attach_struct_ops(skel->maps.flatcg_ops);
+	SCX_BUG_ON(!link, "Failed to attach struct_ops");
+
+	while (!exit_req && !uei_exited(&skel->bss->uei)) {
+		__u64 acc_stats[FCG_NR_STATS];
+		__u64 stats[FCG_NR_STATS];
+		float cpu_util;
+		int i;
+
+		cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle);
+
+		/* acc_stats are running totals; stats are per-interval deltas */
+		fcg_read_stats(skel, acc_stats);
+		for (i = 0; i < FCG_NR_STATS; i++)
+			stats[i] = acc_stats[i] - last_stats[i];
+
+		memcpy(last_stats, acc_stats, sizeof(acc_stats));
+
+		printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
+		       seq++, cpu_util * 100.0, skel->data->hweight_gen);
+		printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
+		       stats[FCG_STAT_ACT],
+		       stats[FCG_STAT_DEACT],
+		       stats[FCG_STAT_GLOBAL],
+		       stats[FCG_STAT_LOCAL]);
+		printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
+		       stats[FCG_STAT_HWT_CACHE],
+		       stats[FCG_STAT_HWT_UPDATES],
+		       stats[FCG_STAT_HWT_SKIP],
+		       stats[FCG_STAT_HWT_RACE]);
+		printf("ENQ skip:%6llu race:%6llu\n",
+		       stats[FCG_STAT_ENQ_SKIP],
+		       stats[FCG_STAT_ENQ_RACE]);
+		printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
+		       stats[FCG_STAT_CNS_KEEP],
+		       stats[FCG_STAT_CNS_EXPIRE],
+		       stats[FCG_STAT_CNS_EMPTY],
+		       stats[FCG_STAT_CNS_GONE]);
+		printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
+		       stats[FCG_STAT_PNC_NEXT],
+		       stats[FCG_STAT_PNC_EMPTY],
+		       stats[FCG_STAT_PNC_NO_CGRP],
+		       stats[FCG_STAT_PNC_GONE],
+		       stats[FCG_STAT_PNC_RACE],
+		       stats[FCG_STAT_PNC_FAIL]);
+		/* the one counter reported as a running total, not a delta */
+		printf("BAD remove:%6llu\n",
+		       acc_stats[FCG_STAT_BAD_REMOVAL]);
+		fflush(stdout);
+
+		nanosleep(&intv_ts, NULL);
+	}
+
+	bpf_link__destroy(link);
+	uei_print(&skel->bss->uei);
+	scx_flatcg__destroy(skel);
+	return 0;
+}
diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h
new file mode 100644
index 0000000000000..6f2ea50acb1cb
--- /dev/null
+++ b/tools/sched_ext/scx_flatcg.h
@@ -0,0 +1,51 @@
+#ifndef __SCX_EXAMPLE_FLATCG_H
+#define __SCX_EXAMPLE_FLATCG_H
+
+enum {
+	FCG_HWEIGHT_ONE		= 1LLU << 16,
+};
+
+enum fcg_stat_idx {
+	FCG_STAT_ACT,
+	FCG_STAT_DEACT,
+	FCG_STAT_LOCAL,
+	FCG_STAT_GLOBAL,
+
+	
FCG_STAT_HWT_UPDATES, + FCG_STAT_HWT_CACHE, + FCG_STAT_HWT_SKIP, + FCG_STAT_HWT_RACE, + + FCG_STAT_ENQ_SKIP, + FCG_STAT_ENQ_RACE, + + FCG_STAT_CNS_KEEP, + FCG_STAT_CNS_EXPIRE, + FCG_STAT_CNS_EMPTY, + FCG_STAT_CNS_GONE, + + FCG_STAT_PNC_NO_CGRP, + FCG_STAT_PNC_NEXT, + FCG_STAT_PNC_EMPTY, + FCG_STAT_PNC_GONE, + FCG_STAT_PNC_RACE, + FCG_STAT_PNC_FAIL, + + FCG_STAT_BAD_REMOVAL, + + FCG_NR_STATS, +}; + +struct fcg_cgrp_ctx { + u32 nr_active; + u32 nr_runnable; + u32 queued; + u32 weight; + u32 hweight; + u64 child_weight_sum; + u64 hweight_gen; + s64 cvtime_delta; + u64 tvtime_now; +}; + +#endif /* __SCX_EXAMPLE_FLATCG_H */ diff --git a/tools/sched_ext/scx_layered/.gitignore b/tools/sched_ext/scx_layered/.gitignore new file mode 100644 index 0000000000000..186dba259ec21 --- /dev/null +++ b/tools/sched_ext/scx_layered/.gitignore @@ -0,0 +1,3 @@ +src/bpf/.output +Cargo.lock +target diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml new file mode 100644 index 0000000000000..37a811e3807e2 --- /dev/null +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "scx_layered" +version = "0.0.4" +authors = ["Tejun Heo ", "Meta"] +edition = "2021" +description = "Userspace scheduling with BPF for Ads" +license = "GPL-2.0-only" + +[dependencies] +anyhow = "1.0" +bitvec = "1.0" +clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } +ctrlc = { version = "3.1", features = ["termination"] } +fb_procfs = "0.7" +lazy_static = "1.4" +libbpf-rs = "0.22" +libc = "0.2" +log = "0.4" +scx_utils = "0.5" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +simplelog = "0.12" + +[build-dependencies] +scx_utils = "0.5" + +[features] +enable_backtrace = [] diff --git a/tools/sched_ext/scx_layered/README.md b/tools/sched_ext/scx_layered/README.md new file mode 100644 index 0000000000000..37c554b2354db --- /dev/null +++ b/tools/sched_ext/scx_layered/README.md @@ -0,0 +1,37 @@ +# 
scx_layered
+
+This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), a Linux kernel feature that enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main).
+
+## Overview
+
+A highly configurable multi-layer BPF / user space hybrid scheduler.
+
+scx_layered allows the user to classify tasks into multiple layers, and apply
+different scheduling policies to those layers. For example, a layer could be
+created from all tasks that are part of the `user.slice` cgroup slice, and a
+policy could be specified that ensures that the layer is given at least 80% CPU
+utilization for some subset of CPUs on the system.
+
+## How To Install
+
+Available as a [Rust crate](https://crates.io/crates/scx_layered): `cargo add scx_layered`
+
+## Typical Use Case
+
+scx_layered is designed to be highly customizable, and can be targeted for
+specific applications. For example, if you had a high-priority service that
+required priority access to all but 1 physical core to ensure acceptable p99
+latencies, you could specify that the service would get priority access to all
+but 1 core on the system. If that service ends up not utilizing all of those
+cores, they could be used by other layers until they're needed.
+
+## Production Ready?
+
+Yes. If tuned correctly, scx_layered should be performant across various CPU
+architectures and workloads.
+
+That said, you may run into an issue with infeasible weights, where a task with
+a very high weight may cause the scheduler to incorrectly leave cores idle
+because it thinks they're necessary to accommodate the compute for a single
+task. This can also happen in CFS, and should soon be addressed for
+scx_layered.
diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs new file mode 100644 index 0000000000000..d26db839cd9e1 --- /dev/null +++ b/tools/sched_ext/scx_layered/build.rs @@ -0,0 +1,13 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +fn main() { + scx_utils::BpfBuilder::new() + .unwrap() + .enable_intf("src/bpf/intf.h", "bpf_intf.rs") + .enable_skel("src/bpf/main.bpf.c", "bpf") + .build() + .unwrap(); +} diff --git a/tools/sched_ext/scx_layered/rustfmt.toml b/tools/sched_ext/scx_layered/rustfmt.toml new file mode 100644 index 0000000000000..b7258ed0a8d84 --- /dev/null +++ b/tools/sched_ext/scx_layered/rustfmt.toml @@ -0,0 +1,8 @@ +# Get help on options with `rustfmt --help=config` +# Please keep these in alphabetical order. +edition = "2021" +group_imports = "StdExternalCrate" +imports_granularity = "Item" +merge_derives = false +use_field_init_shorthand = true +version = "Two" diff --git a/tools/sched_ext/scx_layered/src/bpf/intf.h b/tools/sched_ext/scx_layered/src/bpf/intf.h new file mode 100644 index 0000000000000..000f48b4d7502 --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf/intf.h @@ -0,0 +1,100 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
+#ifndef __INTF_H
+#define __INTF_H
+
+#include
+#ifndef __kptr
+#ifdef __KERNEL__
+#error "__kptr not defined in the kernel"
+#endif
+#define __kptr
+#endif
+
+/* fixed-width aliases for builds where __KERNEL__ is not defined */
+#ifndef __KERNEL__
+typedef unsigned long long u64;
+typedef long long s64;
+#endif
+
+#include
+
+enum consts {
+	MAX_CPUS_SHIFT		= 9,
+	MAX_CPUS		= 1 << MAX_CPUS_SHIFT,
+	MAX_CPUS_U8		= MAX_CPUS / 8,	/* bytes in a MAX_CPUS-bit bitmap */
+	MAX_TASKS		= 131072,
+	MAX_PATH		= 4096,
+	MAX_COMM		= 16,
+	MAX_LAYER_MATCH_ORS	= 32,
+	MAX_LAYERS		= 16,
+	USAGE_HALF_LIFE		= 100000000,	/* 100ms */
+
+	/* XXX remove */
+	MAX_CGRP_PREFIXES	= 32
+};
+
+/* Statistics */
+
+/* indices into cpu_ctx.gstats[] */
+enum global_stat_idx {
+	GSTAT_TASK_CTX_FREE_FAILED,
+	NR_GSTATS,
+};
+
+/* second index into cpu_ctx.lstats[][]; the first index is the layer */
+enum layer_stat_idx {
+	LSTAT_LOCAL,
+	LSTAT_GLOBAL,
+	LSTAT_OPEN_IDLE,
+	LSTAT_AFFN_VIOL,
+	LSTAT_PREEMPT,
+	NR_LSTATS,
+};
+
+/* per-CPU scheduler state and statistics */
+struct cpu_ctx {
+	bool			current_preempt;
+	u64			layer_cycles[MAX_LAYERS];
+	u64			gstats[NR_GSTATS];
+	u64			lstats[MAX_LAYERS][NR_LSTATS];
+};
+
+/* selects which field of struct layer_match a match uses */
+enum layer_match_kind {
+	MATCH_CGROUP_PREFIX,
+	MATCH_COMM_PREFIX,
+	MATCH_NICE_ABOVE,
+	MATCH_NICE_BELOW,
+
+	NR_LAYER_MATCH_KINDS,
+};
+
+/* one match condition; @kind is an enum layer_match_kind value */
+struct layer_match {
+	int		kind;
+	char		cgroup_prefix[MAX_PATH];
+	char		comm_prefix[MAX_COMM];
+	int		nice_above_or_below;
+};
+
+/* a group of match conditions; the first @nr_match_ands entries are used */
+struct layer_match_ands {
+	struct layer_match	matches[NR_LAYER_MATCH_KINDS];
+	int			nr_match_ands;
+};
+
+/* per-layer configuration and runtime state */
+struct layer {
+	/* match groups; the first @nr_match_ors entries are used */
+	struct layer_match_ands	matches[MAX_LAYER_MATCH_ORS];
+	unsigned int		nr_match_ors;
+	unsigned int		idx;
+	bool			open;
+	bool			preempt;
+
+	u64			vtime_now;
+	u64			nr_tasks;
+
+	u64			load;
+	struct ravg_data	load_rd;
+
+	u64			cpus_seq;
+	unsigned int		refresh_cpus;
+	unsigned char		cpus[MAX_CPUS_U8];
+	unsigned int		nr_cpus;	// managed from BPF side
+};
+
+#endif /* __INTF_H */
diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
new file mode 100644
index 0000000000000..21dd0e4cd8395
--- /dev/null
+++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c
@@ -0,0 +1,979 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
*/ +#include +#include +#include "intf.h" + +#include +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +const volatile u32 debug = 0; +const volatile u64 slice_ns = SCX_SLICE_DFL; +const volatile u32 nr_possible_cpus = 1; +const volatile u32 nr_layers = 1; +const volatile bool smt_enabled = true; +const volatile unsigned char all_cpus[MAX_CPUS_U8]; + +private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; +struct layer layers[MAX_LAYERS]; +u32 fallback_cpu; +static u32 preempt_cursor; + +#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0) +#define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) + +#include "util.bpf.c" + +struct user_exit_info uei; + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, u32); + __type(value, struct cpu_ctx); + __uint(max_entries, 1); +} cpu_ctxs SEC(".maps"); + +static struct cpu_ctx *lookup_cpu_ctx(int cpu) +{ + struct cpu_ctx *cctx; + u32 zero = 0; + + if (cpu < 0) + cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); + else + cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); + + if (!cctx) { + scx_bpf_error("no cpu_ctx for cpu %d", cpu); + return NULL; + } + + return cctx; +} + +static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx) +{ + if (idx < 0 || idx >= NR_GSTATS) { + scx_bpf_error("invalid global stat idx %d", idx); + return; + } + + cctx->gstats[idx]++; +} + +static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, struct cpu_ctx *cctx) +{ + u64 *vptr; + + if ((vptr = MEMBER_VPTR(*cctx, .lstats[layer->idx][idx]))) + (*vptr)++; + else + scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx); +} + +struct lock_wrapper { + struct bpf_spin_lock lock; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct lock_wrapper); + __uint(max_entries, 
MAX_LAYERS);
+	__uint(map_flags, 0);
+} layer_load_locks SEC(".maps");
+
+/*
+ * Add @adj to the load of layer @layer_idx and fold the new value into its
+ * running average at time @now, both under the layer's load lock. When debug
+ * is set, a negative @adj that leaves the load negative raises an error.
+ */
+static void adj_load(u32 layer_idx, s64 adj, u64 now)
+{
+	struct layer *layer;
+	struct lock_wrapper *lockw;
+
+	layer = MEMBER_VPTR(layers, [layer_idx]);
+	lockw = bpf_map_lookup_elem(&layer_load_locks, &layer_idx);
+
+	if (!layer || !lockw) {
+		scx_bpf_error("Can't access layer%d or its load_lock", layer_idx);
+		return;
+	}
+
+	bpf_spin_lock(&lockw->lock);
+	layer->load += adj;
+	ravg_accumulate(&layer->load_rd, layer->load, now, USAGE_HALF_LIFE);
+	bpf_spin_unlock(&lockw->lock);
+
+	if (debug && adj < 0 && (s64)layer->load < 0)
+		scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)",
+			      bpf_get_smp_processor_id(), layer_idx, layer->load, adj);
+}
+
+/* kptrs have to live in a map value, hence the wrapper */
+struct layer_cpumask_wrapper {
+	struct bpf_cpumask __kptr *cpumask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, struct layer_cpumask_wrapper);
+	__uint(max_entries, MAX_LAYERS);
+	__uint(map_flags, 0);
+} layer_cpumasks SEC(".maps");
+
+/*
+ * Return the cpumask stored for layer @idx. Raises an error and returns NULL
+ * if the map lookup fails; the stored pointer itself may also be NULL.
+ */
+static struct cpumask *lookup_layer_cpumask(int idx)
+{
+	struct layer_cpumask_wrapper *cpumaskw;
+
+	if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) {
+		return (struct cpumask *)cpumaskw->cpumask;
+	} else {
+		scx_bpf_error("no layer_cpumask");
+		return NULL;
+	}
+}
+
+/*
+ * If layer @idx has refresh_cpus set, clear the flag and rebuild the layer's
+ * bpf_cpumask from the layers[idx].cpus bitmap, then update nr_cpus and bump
+ * cpus_seq.
+ */
+static void refresh_cpumasks(int idx)
+{
+	struct layer_cpumask_wrapper *cpumaskw;
+	struct layer *layer;
+	int cpu, total = 0;
+
+	/* atomically consume the refresh request; nothing to do if unset */
+	if (!__sync_val_compare_and_swap(&layers[idx].refresh_cpus, 1, 0))
+		return;
+
+	cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx);
+
+	bpf_for(cpu, 0, nr_possible_cpus) {
+		u8 *u8_ptr;
+
+		if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) {
+			/*
+			 * XXX - The following test should be outside the loop
+			 * but that makes the verifier think that
+			 * cpumaskw->cpumask might be NULL in the loop.
+			 */
+			barrier_var(cpumaskw);
+			if (!cpumaskw || !cpumaskw->cpumask) {
+				scx_bpf_error("can't happen");
+				return;
+			}
+
+			/* bit (cpu % 8) of byte (cpu / 8) says whether @cpu is in the layer */
+			if (*u8_ptr & (1 << (cpu % 8))) {
+				bpf_cpumask_set_cpu(cpu, cpumaskw->cpumask);
+				total++;
+			} else {
+				bpf_cpumask_clear_cpu(cpu, cpumaskw->cpumask);
+			}
+		} else {
+			scx_bpf_error("can't happen");
+		}
+	}
+
+	// XXX - shouldn't be necessary
+	layer = MEMBER_VPTR(layers, [idx]);
+	if (!layer) {
+		scx_bpf_error("can't happen");
+		return;
+	}
+
+	layer->nr_cpus = total;
+	__sync_fetch_and_add(&layer->cpus_seq, 1);
+	trace("LAYER[%d] now has %d cpus, seq=%llu", idx, layer->nr_cpus, layer->cpus_seq);
+}
+
+/* on CPU 0 only, apply pending cpumask refreshes for every layer */
+SEC("fentry/scheduler_tick")
+int scheduler_tick_fentry(const void *ctx)
+{
+	int idx;
+
+	if (bpf_get_smp_processor_id() == 0)
+		bpf_for(idx, 0, nr_layers)
+			refresh_cpumasks(idx);
+	return 0;
+}
+
+/* per-task scheduler state, stored in task_ctxs keyed by pid */
+struct task_ctx {
+	int			pid;
+
+	int			layer;
+	bool			refresh_layer;
+	u64			layer_cpus_seq;
+	struct bpf_cpumask __kptr *layered_cpumask;
+
+	bool			all_cpus_allowed;
+	bool			dispatch_local;
+	u64			started_running_at;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, pid_t);
+	__type(value, struct task_ctx);
+	__uint(max_entries, MAX_TASKS);
+	__uint(map_flags, 0);
+} task_ctxs SEC(".maps");
+
+/* return @p's task_ctx or NULL; a miss is not treated as an error */
+struct task_ctx *lookup_task_ctx_may_fail(struct task_struct *p)
+{
+	s32 pid = p->pid;
+
+	return bpf_map_lookup_elem(&task_ctxs, &pid);
+}
+
+/* return @p's task_ctx; a miss raises a scheduler error and returns NULL */
+struct task_ctx *lookup_task_ctx(struct task_struct *p)
+{
+	struct task_ctx *tctx;
+	s32 pid = p->pid;
+
+	if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid))) {
+		return tctx;
+	} else {
+		scx_bpf_error("task_ctx lookup failed");
+		return NULL;
+	}
+}
+
+/*
+ * Return &layers[@idx]. An @idx outside [0, nr_layers) raises a scheduler
+ * error and returns NULL.
+ */
+struct layer *lookup_layer(int idx)
+{
+	if (idx < 0 || idx >= nr_layers) {
+		scx_bpf_error("invalid layer %d", idx);
+		return NULL;
+	}
+	return &layers[idx];
+}
+
+/*
+ * Because the layer membership is by the default hierarchy cgroups rather than
+ * the CPU controller membership, we can't use ops.cgroup_move().
Let's iterate
+ * the tasks manually and set refresh_layer.
+ *
+ * The iteration isn't synchronized and may fail spuriously. It's not a big
+ * practical problem as process migrations are very rare in most modern systems.
+ * That said, we eventually want this to be based on CPU controller membership.
+ */
+SEC("tp_btf/cgroup_attach_task")
+int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
+	     struct task_struct *leader, bool threadgroup)
+{
+	struct list_head *thread_head;
+	struct task_struct *next;
+	struct task_ctx *tctx;
+
+	if (!(tctx = lookup_task_ctx_may_fail(leader)))
+		return 0;
+	tctx->refresh_layer = true;
+
+	if (!threadgroup)
+		return 0;
+
+	thread_head = &leader->signal->thread_head;
+
+	if (!(next = bpf_task_acquire(leader))) {
+		scx_bpf_error("failed to acquire leader");
+		return 0;
+	}
+
+	/*
+	 * Walk the thread list starting from the leader. @next always holds
+	 * an acquired reference to the thread visited last.
+	 */
+	bpf_repeat(MAX_TASKS) {
+		struct task_struct *p;
+		int pid;
+
+		p = container_of(next->thread_node.next, struct task_struct, thread_node);
+		bpf_task_release(next);
+
+		/* back at the list head: every thread has been visited */
+		if (&p->thread_node == thread_head) {
+			next = NULL;
+			break;
+		}
+
+		pid = BPF_CORE_READ(p, pid);
+		next = bpf_task_from_pid(pid);
+		if (!next) {
+			bpf_printk("scx_layered: tp_cgroup_attach_task: thread iteration failed");
+			break;
+		}
+
+		if ((tctx = lookup_task_ctx(next)))
+			tctx->refresh_layer = true;
+	}
+
+	/* only non-NULL if the loop ran out of iterations */
+	if (next)
+		bpf_task_release(next);
+	return 0;
+}
+
+/* a comm change can alter layer matching, so request a layer refresh */
+SEC("tp_btf/task_rename")
+int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf)
+{
+	struct task_ctx *tctx;
+
+	if ((tctx = lookup_task_ctx_may_fail(p)))
+		tctx->refresh_layer = true;
+	return 0;
+}
+
+/*
+ * Recompute @layered_cpumask as @layer_cpumask & @p->cpus_ptr if the cpumask
+ * of @p's layer has changed since @tctx last saw it.
+ *
+ * The sequence number is that of the task's own layer (tctx->layer). Each
+ * layer advances its cpus_seq independently in refresh_cpumasks(), and
+ * maybe_refresh_layer() seeds tctx->layer_cpus_seq from the matched layer, so
+ * comparing against any other layer's counter would give wrong results.
+ */
+static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask,
+					  struct task_struct *p, struct task_ctx *tctx,
+					  const struct cpumask *layer_cpumask)
+{
+	struct layer *layer;
+	u64 layer_seq;
+
+	/* lookup_layer() raises an error for an out-of-range layer index */
+	if (!(layer = lookup_layer(tctx->layer)))
+		return;
+	layer_seq = layer->cpus_seq;
+
+	if (tctx->layer_cpus_seq == layer_seq)
+		return;
+
+	/*
+	 * XXX - We're assuming that the updated @layer_cpumask matching the new
+	 * @layer_seq is visible which may not be true. For now, leave it as-is.
+	 * Let's update once BPF grows enough memory ordering constructs.
+	 */
+	bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr);
+	tctx->layer_cpus_seq = layer_seq;
+	trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq);
+}
+
+/*
+ * Pick and claim an idle CPU from @cand_cpumask, preferring @prev_cpu.
+ *
+ * With SMT enabled, first try @prev_cpu if it is set in @idle_smtmask, then
+ * any CPU picked with SCX_PICK_IDLE_CORE. After that, fall back to @prev_cpu
+ * if idle and finally to any idle candidate. Returns the CPU number, or the
+ * result of the last scx_bpf_pick_idle_cpu() call, which the caller treats as
+ * a failure when negative.
+ */
+static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
+			      const struct cpumask *idle_cpumask,
+			      const struct cpumask *idle_smtmask)
+{
+	bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask);
+	s32 cpu;
+
+	/*
+	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
+	 * partially idle @prev_cpu.
+	 */
+	if (smt_enabled) {
+		if (prev_in_cand &&
+		    bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
+		    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+			return prev_cpu;
+
+		cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE);
+		if (cpu >= 0)
+			return cpu;
+	}
+
+	if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+		return prev_cpu;
+
+	return scx_bpf_pick_idle_cpu(cand_cpumask, 0);
+}
+
+/*
+ * ops.select_cpu(): pick a CPU for the waking task @p. When an idle CPU is
+ * claimed, tctx->dispatch_local is set so that layered_enqueue() dispatches
+ * @p to the local DSQ. Otherwise @prev_cpu is returned.
+ */
+s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	const struct cpumask *idle_cpumask, *idle_smtmask;
+	struct cpumask *layer_cpumask, *layered_cpumask;
+	struct cpu_ctx *cctx;
+	struct task_ctx *tctx;
+	struct layer *layer;
+	s32 cpu;
+
+	/* look up everything we need */
+	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
+	    !(layered_cpumask = (struct cpumask *)tctx->layered_cpumask))
+		return prev_cpu;
+
+	/*
+	 * We usually update the layer in layered_runnable() to avoid confusion.
+	 * As layered_select_cpu() takes place before runnable, new tasks would
+	 * still have -1 layer. Just return @prev_cpu.
+	 */
+	if (tctx->layer < 0)
+		return prev_cpu;
+
+	if (!(layer = lookup_layer(tctx->layer)) ||
+	    !(layer_cpumask = lookup_layer_cpumask(tctx->layer)))
+		return prev_cpu;
+
+	if (!(idle_cpumask = scx_bpf_get_idle_cpumask()))
+		return prev_cpu;
+
+	if (!(idle_smtmask = scx_bpf_get_idle_smtmask())) {
+		cpu = prev_cpu;
+		goto out_put_idle_cpumask;
+	}
+
+	/* not much to do if bound to a single CPU */
+	if (p->nr_cpus_allowed == 1) {
+		cpu = prev_cpu;
+		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+			if (!bpf_cpumask_test_cpu(cpu, layer_cpumask))
+				lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
+			goto dispatch_local;
+		} else {
+			goto out_put_cpumasks;
+		}
+	}
+
+	maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask);
+
+	/*
+	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
+	 * partially idle @prev_cpu.
+	 */
+	if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu,
+				      idle_cpumask, idle_smtmask)) >= 0)
+		goto dispatch_local;
+
+	/*
+	 * If the layer is an open one, we can try the whole machine.
+	 */
+	if (layer->open &&
+	    ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu,
+				       idle_cpumask, idle_smtmask)) >= 0)) {
+		lstat_inc(LSTAT_OPEN_IDLE, layer, cctx);
+		goto dispatch_local;
+	}
+
+	cpu = prev_cpu;
+	goto out_put_cpumasks;
+
+dispatch_local:
+	tctx->dispatch_local = true;
+out_put_cpumasks:
+	scx_bpf_put_idle_cpumask(idle_smtmask);
+out_put_idle_cpumask:
+	scx_bpf_put_idle_cpumask(idle_cpumask);
+	return cpu;
+}
+
+/*
+ * ops.enqueue(): queue @p.
+ *
+ * If layered_select_cpu() set dispatch_local, @p goes to the local DSQ. A
+ * task without all_cpus_allowed goes to the global DSQ. Everything else is
+ * queued by vtime on the DSQ whose ID is the layer index, and for a
+ * preempting layer one CPU that isn't already running a preempting task is
+ * kicked with SCX_KICK_PREEMPT.
+ */
+void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct cpu_ctx *cctx;
+	struct task_ctx *tctx;
+	struct layer *layer;
+	u64 vtime = p->scx.dsq_vtime;
+	u32 idx;
+
+	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
+	    !(layer = lookup_layer(tctx->layer)))
+		return;
+
+	if (tctx->dispatch_local) {
+		tctx->dispatch_local = false;
+		lstat_inc(LSTAT_LOCAL, layer, cctx);
+		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
+		return;
+	}
+
+	lstat_inc(LSTAT_GLOBAL, layer, cctx);
+
+	/*
+	 * Limit the amount of budget that an idling task can accumulate
+	 * to one slice.
+	 */
+	if (vtime_before(vtime, layer->vtime_now - slice_ns))
+		vtime = layer->vtime_now - slice_ns;
+
+	if (!tctx->all_cpus_allowed) {
+		lstat_inc(LSTAT_AFFN_VIOL, layer, cctx);
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags);
+		return;
+	}
+
+	scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags);
+
+	if (!layer->preempt)
+		return;
+
+	/* scan CPUs round-robin starting at preempt_cursor and kick the first fit */
+	bpf_for(idx, 0, nr_possible_cpus) {
+		struct cpu_ctx *cand_cctx;
+		u32 cpu = (preempt_cursor + idx) % nr_possible_cpus;
+
+		if (!all_cpumask ||
+		    !bpf_cpumask_test_cpu(cpu, (const struct cpumask *)all_cpumask))
+			continue;
+		if (!(cand_cctx = lookup_cpu_ctx(cpu)) || cand_cctx->current_preempt)
+			continue;
+
+		scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
+
+		/*
+		 * Round-robining doesn't have to be strict. Let's not bother
+		 * with atomic ops on $preempt_cursor.
+		 */
+		preempt_cursor = (cpu + 1) % nr_possible_cpus;
+
+		lstat_inc(LSTAT_PREEMPT, layer, cctx);
+		break;
+	}
+}
+
+/*
+ * ops.dispatch(): consume from the per-layer DSQs in three passes and stop at
+ * the first success: preempting layers, then non-open layers whose cpumask
+ * contains @cpu (or which have no CPUs, when @cpu is fallback_cpu), then
+ * open non-preempting layers.
+ */
+void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
+{
+	int idx;
+
+	/* consume preempting layers first */
+	bpf_for(idx, 0, nr_layers)
+		if (layers[idx].preempt && scx_bpf_consume(idx))
+			return;
+
+	/* consume !open layers second */
+	bpf_for(idx, 0, nr_layers) {
+		struct layer *layer = &layers[idx];
+		struct cpumask *layer_cpumask;
+
+		if (layer->open)
+			continue;
+
+		/* consume matching layers */
+		if (!(layer_cpumask = lookup_layer_cpumask(idx)))
+			return;
+
+		if (bpf_cpumask_test_cpu(cpu, layer_cpumask) ||
+		    (cpu == fallback_cpu && layer->nr_cpus == 0)) {
+			if (scx_bpf_consume(idx))
+				return;
+		}
+	}
+
+	/* consume !preempting open layers */
+	bpf_for(idx, 0, nr_layers) {
+		if (!layers[idx].preempt && layers[idx].open &&
+		    scx_bpf_consume(idx))
+			return;
+	}
+}
+
+/*
+ * Test a single match condition against @p and its cgroup path. The nice
+ * value is derived as static_prio - 120. An unknown kind raises an error and
+ * counts as no match.
+ */
+static bool match_one(struct layer_match *match, struct task_struct *p, const char *cgrp_path)
+{
+	switch (match->kind) {
+	case MATCH_CGROUP_PREFIX: {
+		return match_prefix(match->cgroup_prefix, cgrp_path, MAX_PATH);
+	}
+	case MATCH_COMM_PREFIX: {
+		char comm[MAX_COMM];
+		memcpy(comm, p->comm, MAX_COMM);
+		return match_prefix(match->comm_prefix, comm, MAX_COMM);
+	}
+	case MATCH_NICE_ABOVE:
+		return (s32)p->static_prio - 120 > match->nice_above_or_below;
+	case MATCH_NICE_BELOW:
+		return (s32)p->static_prio - 120 < match->nice_above_or_below;
+	default:
+		scx_bpf_error("invalid match kind %d", match->kind);
+		return false;
+	}
+}
+
+/*
+ * Return true if @p matches @layer: at least one of the layer's OR groups
+ * must have all of its AND conditions satisfied. A group with zero
+ * conditions therefore matches every task.
+ */
+static bool match_layer(struct layer *layer, struct task_struct *p, const char *cgrp_path)
+{
+	u32 nr_match_ors = layer->nr_match_ors;
+	u64 or_idx, and_idx;
+
+	if (nr_match_ors > MAX_LAYER_MATCH_ORS) {
+		scx_bpf_error("too many ORs");
+		return false;
+	}
+
+	bpf_for(or_idx, 0, nr_match_ors) {
+		struct layer_match_ands *ands;
+		bool matched = true;
+
+		barrier_var(or_idx);
+		if (or_idx >= MAX_LAYER_MATCH_ORS)
+			return false; /* can't happen */
+		ands = &layer->matches[or_idx];
+
+		if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) {
+			scx_bpf_error("too many ANDs");
+			return false;
+		}
+
+		bpf_for(and_idx, 0, ands->nr_match_ands) {
+			struct layer_match *match;
+
+			barrier_var(and_idx);
+			if (and_idx >= NR_LAYER_MATCH_KINDS)
+				return false; /* can't happen */
+			match = &ands->matches[and_idx];
+
+			if (!match_one(match, p, cgrp_path)) {
+				matched = false;
+				break;
+			}
+		}
+
+		if (matched)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * If @tctx->refresh_layer is set, clear it and re-classify @p: drop it from
+ * its current layer's task count, assign the first layer that matches, and
+ * restart its vtime at that layer's vtime_now.
+ *
+ * NOTE(review): when no layer matches, an error is raised and tctx->layer
+ * keeps its old value even though that layer's nr_tasks was already
+ * decremented.
+ */
+static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx)
+{
+	const char *cgrp_path;
+	bool matched = false;
+	u64 idx;	// XXX - int makes verifier unhappy
+
+	if (!tctx->refresh_layer)
+		return;
+	tctx->refresh_layer = false;
+
+	if (!(cgrp_path = format_cgrp_path(p->cgroups->dfl_cgrp)))
+		return;
+
+	if (tctx->layer >= 0 && tctx->layer < nr_layers)
+		__sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1);
+
+	bpf_for(idx, 0, nr_layers) {
+		if (match_layer(&layers[idx], p, cgrp_path)) {
+			matched = true;
+			break;
+		}
+	}
+
+	if (matched) {
+		struct layer *layer = &layers[idx];
+
+		tctx->layer = idx;
+		/* differs from cpus_seq so the task's cpumask gets refreshed */
+		tctx->layer_cpus_seq = layer->cpus_seq - 1;
+		__sync_fetch_and_add(&layer->nr_tasks, 1);
+		/*
+		 * XXX - To be correct, we'd need to calculate the vtime
+		 * delta in the previous layer, scale it by the load
+		 * fraction difference and then offset from the new
+		 * layer's vtime_now. For now, just do the simple thing
+		 * and assume the offset to be zero.
+		 *
+		 * Revisit if high frequency dynamic layer switching
+		 * needs to be supported.
+		 */
+		p->scx.dsq_vtime = layer->vtime_now;
+	} else {
+		scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid);
+	}
+
+	/* tasks that ended up in the last layer are not traced */
+	if (tctx->layer < nr_layers - 1)
+		trace("LAYER=%d %s[%d] cgrp=\"%s\"",
+		      tctx->layer, p->comm, p->pid, cgrp_path);
+}
+
+/* ops.runnable(): re-classify @p if requested, then add its weight to the layer load */
+void BPF_STRUCT_OPS(layered_runnable, struct task_struct *p, u64 enq_flags)
+{
+	u64 now = bpf_ktime_get_ns();
+	struct task_ctx *tctx;
+
+	if (!(tctx = lookup_task_ctx(p)))
+		return;
+
+	maybe_refresh_layer(p, tctx);
+
+	adj_load(tctx->layer, p->scx.weight, now);
+}
+
+/*
+ * ops.running(): advance the layer's vtime_now to @p's vtime if that is
+ * ahead, record whether this CPU now runs a preempting layer's task, and
+ * timestamp the start of the run.
+ */
+void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
+{
+	struct cpu_ctx *cctx;
+	struct task_ctx *tctx;
+	struct layer *layer;
+
+	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) ||
+	    !(layer = lookup_layer(tctx->layer)))
+		return;
+
+	if (vtime_before(layer->vtime_now, p->scx.dsq_vtime))
+		layer->vtime_now = p->scx.dsq_vtime;
+
+	cctx->current_preempt = layer->preempt;
+	tctx->started_running_at = bpf_ktime_get_ns();
+}
+
+/*
+ * ops.stopping(): account the time @p just ran to its layer on this CPU and
+ * charge @p's vtime with that time scaled by 100 / weight.
+ */
+void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
+{
+	struct cpu_ctx *cctx;
+	struct task_ctx *tctx;
+	u64 used;
+	u32 layer;
+
+	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
+		return;
+
+	/* unsigned, so a negative tctx->layer is rejected by this check too */
+	layer = tctx->layer;
+	if (layer >= nr_layers) {
+		scx_bpf_error("invalid layer %u", layer);
+		return;
+	}
+
+	used = bpf_ktime_get_ns() - tctx->started_running_at;
+	cctx->layer_cycles[layer] += used;
+	cctx->current_preempt = false;
+
+	/* scale the execution time by the inverse of the weight and charge */
+	p->scx.dsq_vtime += used * 100 / p->scx.weight;
+}
+
+/* ops.quiescent(): remove @p's weight from its layer's load */
+void BPF_STRUCT_OPS(layered_quiescent, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	if ((tctx = lookup_task_ctx(p)))
+		adj_load(tctx->layer, -(s64)p->scx.weight, bpf_ktime_get_ns());
+}
+
+/* ops.set_weight(): request a layer refresh for @p */
+void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
+{
+	struct task_ctx *tctx;
+
+	if ((tctx = lookup_task_ctx(p)))
+		tctx->refresh_layer = true;
+}
+
+void
BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + struct task_ctx *tctx; + + if (!(tctx = lookup_task_ctx(p))) + return; + + if (!all_cpumask) { + scx_bpf_error("NULL all_cpumask"); + return; + } + + tctx->all_cpus_allowed = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); +} + +s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx tctx_init = { + .pid = p->pid, + .layer = -1, /* not a member of any layer yet */ + .refresh_layer = true, /* request layer matching on the first runnable() */ + }; + struct task_ctx *tctx; + struct bpf_cpumask *cpumask; + s32 pid = p->pid; + s32 ret; + + if (all_cpumask) + tctx_init.all_cpus_allowed = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, p->cpus_ptr); + else + scx_bpf_error("missing all_cpumask"); + + /* + * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .exit_task() may + * fail spuriously due to BPF recursion protection triggering + * unnecessarily, which can leave a stale entry behind for this pid. + */ + if ((ret = bpf_map_update_elem(&task_ctxs, &pid, &tctx_init, 0 /*BPF_NOEXIST*/))) { + scx_bpf_error("task_ctx allocation failure, ret=%d", ret); + return ret; + } + + /* + * Read the entry from the map immediately so we can add the cpumask + * with bpf_kptr_xchg(). + */ + if (!(tctx = lookup_task_ctx(p))) + return -ENOENT; + + cpumask = bpf_cpumask_create(); + if (!cpumask) { + bpf_map_delete_elem(&task_ctxs, &pid); + return -ENOMEM; + } + + cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask); + if (cpumask) { + /* Should never happen as we just inserted it above. */ + bpf_cpumask_release(cpumask); + bpf_map_delete_elem(&task_ctxs, &pid); + return -EINVAL; + } + + /* + * We are matching cgroup hierarchy path directly rather than the CPU + * controller path. As the former isn't available during the scheduler + * fork path, let's delay the layer selection until the first + * runnable(). 
+ */ + + return 0; +} + +void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + struct cpu_ctx *cctx; + struct task_ctx *tctx; + s32 pid = p->pid; + int ret; + + if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) + return; + + if (tctx->layer >= 0 && tctx->layer < nr_layers) + __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); /* drop the count taken when the task was matched into this layer */ + + /* + * XXX - There's no reason delete should fail here but BPF's recursion + * protection can unnecessarily fail the operation. The fact that + * deletions aren't reliable means that we sometimes leak task_ctx and + * can't use BPF_NOEXIST on allocation in .init_task(). + */ + ret = bpf_map_delete_elem(&task_ctxs, &pid); + if (ret) + gstat_inc(GSTAT_TASK_CTX_FREE_FAILED, cctx); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) +{ + struct bpf_cpumask *cpumask; + int i, j, k, nr_online_cpus, ret; + + scx_bpf_switch_all(); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + nr_online_cpus = 0; + bpf_for(i, 0, nr_possible_cpus) { + const volatile u8 *u8_ptr; + + if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { + if (*u8_ptr & (1 << (i % 8))) { + bpf_cpumask_set_cpu(i, cpumask); + nr_online_cpus++; + } + } else { + return -EINVAL; + } + } + + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + dbg("CFG: Dumping configuration, nr_online_cpus=%d smt_enabled=%d", + nr_online_cpus, smt_enabled); + + bpf_for(i, 0, nr_layers) { + struct layer *layer = &layers[i]; + + dbg("CFG LAYER[%d] open=%d preempt=%d", + i, layer->open, layer->preempt); + + if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) { + scx_bpf_error("too many ORs"); + return -EINVAL; + } + + bpf_for(j, 0, layer->nr_match_ors) { + struct layer_match_ands *ands = MEMBER_VPTR(layers, [i].matches[j]); + if (!ands) { + scx_bpf_error("shouldn't happen"); + return -EINVAL; + } + + if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { + scx_bpf_error("too many 
ANDs"); + return -EINVAL; + } + + dbg("CFG OR[%02d]", j); + + /* Dump every AND-ed match of this OR group; reject unknown kinds. */ + bpf_for(k, 0, ands->nr_match_ands) { + char header[32]; + u64 header_data[1] = { k }; + struct layer_match *match; + + bpf_snprintf(header, sizeof(header), "CFG AND[%02d]:", + header_data, sizeof(header_data)); + + match = MEMBER_VPTR(layers, [i].matches[j].matches[k]); + if (!match) { + scx_bpf_error("shouldn't happen"); + return -EINVAL; + } + + switch (match->kind) { + case MATCH_CGROUP_PREFIX: + dbg("%s CGROUP_PREFIX \"%s\"", header, match->cgroup_prefix); + break; + case MATCH_COMM_PREFIX: + dbg("%s COMM_PREFIX \"%s\"", header, match->comm_prefix); + break; + case MATCH_NICE_ABOVE: + dbg("%s NICE_ABOVE %d", header, match->nice_above_or_below); + break; + case MATCH_NICE_BELOW: + dbg("%s NICE_BELOW %d", header, match->nice_above_or_below); + break; + default: + scx_bpf_error("%s Invalid kind", header); + return -EINVAL; + } + } + if (ands->nr_match_ands == 0) + dbg("CFG DEFAULT"); + } + } + + /* Create one DSQ per layer (DSQ id == layer index) and seed its cpumask. */ + bpf_for(i, 0, nr_layers) { + struct layer_cpumask_wrapper *cpumaskw; + + layers[i].idx = i; + + ret = scx_bpf_create_dsq(i, -1); + if (ret < 0) + return ret; + + /* A missing layer_cpumasks entry is a lookup miss: report -ENOENT. */ + if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &i))) + return -ENOENT; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + + /* + * Start all layers with full cpumask so that everything runs + * everywhere. This will soon be updated by refresh_cpumasks() + * once the scheduler starts running. 
+ */ + bpf_cpumask_setall(cpumask); + + cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + } + + return 0; +} + +void BPF_STRUCT_OPS(layered_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops layered = { + .select_cpu = (void *)layered_select_cpu, + .enqueue = (void *)layered_enqueue, + .dispatch = (void *)layered_dispatch, + .runnable = (void *)layered_runnable, + .running = (void *)layered_running, + .stopping = (void *)layered_stopping, + .quiescent = (void *)layered_quiescent, + .set_weight = (void *)layered_set_weight, + .set_cpumask = (void *)layered_set_cpumask, + .init_task = (void *)layered_init_task, + .exit_task = (void *)layered_exit_task, + .init = (void *)layered_init, + .exit = (void *)layered_exit, + .name = "layered", +}; diff --git a/tools/sched_ext/scx_layered/src/bpf/util.bpf.c b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c new file mode 100644 index 0000000000000..703e0eece60b2 --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c @@ -0,0 +1,68 @@ +/* to be included in the main bpf.c file */ + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + /* double size because verifier can't follow length calculation */ + __uint(value_size, 2 * MAX_PATH); + __uint(max_entries, 1); +} cgrp_path_bufs SEC(".maps"); + +static char *format_cgrp_path(struct cgroup *cgrp) +{ + u32 zero = 0; + char *path = bpf_map_lookup_elem(&cgrp_path_bufs, &zero); + u32 len = 0, level, max_level; + + if (!path) { + scx_bpf_error("cgrp_path_buf lookup failed"); + return NULL; + } + + max_level = cgrp->level; + if (max_level > 127) + max_level = 127; + + bpf_for(level, 1, max_level + 1) { + int ret; + + if (level > 1 && len < MAX_PATH - 1) + path[len++] = '/'; + + if (len >= MAX_PATH - 1) { + scx_bpf_error("cgrp_path_buf overflow"); + return NULL; + } + + ret = bpf_probe_read_kernel_str(path + len, MAX_PATH - len - 1, 
+ BPF_CORE_READ(cgrp, ancestors[level], kn, name)); + if (ret < 0) { + scx_bpf_error("bpf_probe_read_kernel_str failed"); + return NULL; + } + + len += ret - 1; + } + + if (len >= MAX_PATH - 2) { + scx_bpf_error("cgrp_path_buf overflow"); + return NULL; + } + path[len] = '/'; + path[len + 1] = '\0'; + + return path; +} + +static inline bool match_prefix(const char *prefix, const char *str, u32 max_len) +{ + int c; + + bpf_for(c, 0, max_len) { + if (prefix[c] == '\0') + return true; + if (str[c] != prefix[c]) + return false; + } + return false; +} diff --git a/tools/sched_ext/scx_layered/src/bpf_intf.rs b/tools/sched_ext/scx_layered/src/bpf_intf.rs new file mode 100644 index 0000000000000..0ed31f8e08738 --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf_intf.rs @@ -0,0 +1,10 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs")); diff --git a/tools/sched_ext/scx_layered/src/bpf_skel.rs b/tools/sched_ext/scx_layered/src/bpf_skel.rs new file mode 100644 index 0000000000000..063ccf896d61e --- /dev/null +++ b/tools/sched_ext/scx_layered/src/bpf_skel.rs @@ -0,0 +1,12 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. + +// We can't directly include the generated skeleton in main.rs as it may +// contain compiler attributes that can't be `include!()`ed via macro and we +// can't use the `#[path = "..."]` because `concat!(env!("OUT_DIR"), +// "/bpf.skel.rs")` does not work inside the path attribute yet (see +// https://github.com/rust-lang/rust/pull/83366). 
+ +include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs")); diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs new file mode 100644 index 0000000000000..5b5374226f49a --- /dev/null +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -0,0 +1,1639 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +mod bpf_skel; +pub use bpf_skel::*; +pub mod bpf_intf; + +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::ffi::CStr; +use std::ffi::CString; +use std::fs; +use std::io::Read; +use std::io::Write; +use std::ops::Sub; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::time::Duration; +use std::time::Instant; + +use ::fb_procfs as procfs; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; +use bitvec::prelude::*; +use clap::Parser; +use libbpf_rs::skel::OpenSkel as _; +use libbpf_rs::skel::Skel as _; +use libbpf_rs::skel::SkelBuilder as _; +use log::debug; +use log::info; +use log::trace; +use scx_utils::ravg::ravg_read; +use serde::Deserialize; +use serde::Serialize; + +const RAVG_FRAC_BITS: u32 = bpf_intf::ravg_consts_RAVG_FRAC_BITS; +const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize; +const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize; +const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize; +const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize; +const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize; +const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE; +const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0; +const NR_GSTATS: usize = bpf_intf::global_stat_idx_NR_GSTATS as usize; +const NR_LSTATS: usize = bpf_intf::layer_stat_idx_NR_LSTATS as usize; +const NR_LAYER_MATCH_KINDS: usize = 
bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; +const CORE_CACHE_LEVEL: u32 = 2; + +lazy_static::lazy_static! { + static ref NR_POSSIBLE_CPUS: usize = libbpf_rs::num_possible_cpus().unwrap(); + static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64); +} + +/// scx_layered: A highly configurable multi-layer sched_ext scheduler +/// +/// scx_layered allows classifying tasks into multiple layers and applying +/// different scheduling policies to them. The configuration is specified in +/// json and composed of two parts - matches and policies. +/// +/// Matches +/// ======= +/// +/// Whenever a task is forked or its attributes are changed, the task goes +/// through a series of matches to determine the layer it belongs to. A +/// match set is composed of OR groups of AND blocks. An example: +/// +/// "matches": [ +/// [ +/// { +/// "CgroupPrefix": "system.slice/" +/// } +/// ], +/// [ +/// { +/// "CommPrefix": "fbagent" +/// }, +/// { +/// "NiceAbove": 0 +/// } +/// ] +/// ], +/// +/// The outer array contains the OR groups and the inner AND blocks, so the +/// above matches: +/// +/// * Tasks which are in the cgroup sub-hierarchy under "system.slice". +/// * Or tasks whose comm starts with "fbagent" and have a nice value > 0. +/// +/// Currently, the following matches are supported: +/// +/// * CgroupPrefix: Matches the prefix of the cgroup that the task belongs +/// to. As this is a string match, whether the pattern has the trailing +/// '/' makes a difference. For example, "TOP/CHILD/" only matches tasks +/// which are under that particular cgroup while "TOP/CHILD" also matches +/// tasks under "TOP/CHILD0/" or "TOP/CHILD1/". +/// +/// * CommPrefix: Matches the task's comm prefix. +/// +/// * NiceAbove: Matches if the task's nice value is greater than the +/// pattern. +/// +/// * NiceBelow: Matches if the task's nice value is smaller than the +/// pattern. 
+/// +/// While there are complexity limitations as the matches are performed in +/// BPF, it is straightforward to add more types of matches. +/// +/// Policies +/// ======== +/// +/// The following is an example policy configuration for a layer. +/// +/// "kind": { +/// "Confined": { +/// "cpus_range": [1, 8], +/// "util_range": [0.8, 0.9] +/// } +/// } +/// +/// It's of "Confined" kind, which tries to concentrate the layer's tasks +/// into a limited number of CPUs. In the above case, the number of CPUs +/// assigned to the layer is scaled between 1 and 8 so that the per-cpu +/// utilization is kept between 80% and 90%. If the CPUs are loaded higher +/// than 90%, more CPUs are allocated to the layer. If the utilization drops +/// below 80%, the layer loses CPUs. +/// +/// Currently, the following policy kinds are supported: +/// +/// * Confined: Tasks are restricted to the allocated CPUs. The number of +/// CPUs allocated is modulated to keep the per-CPU utilization in +/// "util_range". The range can optionally be restricted with the +/// "cpus_range" property. +/// +/// * Grouped: Similar to Confined but tasks may spill outside if there are +/// idle CPUs outside the allocated ones. If "preempt" is true, tasks in +/// this layer will preempt tasks which belong to other non-preempting +/// layers when no idle CPUs are available. +/// +/// * Open: Prefer the CPUs which are not occupied by Confined or Grouped +/// layers. Tasks in this group will spill into occupied CPUs if there are +/// no unoccupied idle CPUs. If "preempt" is true, tasks in this layer +/// will preempt tasks which belong to other non-preempting layers when no +/// idle CPUs are available. +/// +/// Similar to matches, adding new policies and extending existing ones +/// should be relatively straightforward. 
+/// +/// Configuration example and running scx_layered +/// ============================================= +/// +/// A scx_layered config is composed of layer configs and a layer config is +/// composed of a name, a set of matches and a policy block. Running the +/// following will write an example configuration into example.json. +/// +/// $ scx_layered -e example.json +/// +/// Note that the last layer in the configuration must have an empty match +/// set as it must match all tasks which haven't been matched into previous +/// layers. +/// +/// The configuration can be specified in multiple json files and command +/// line arguments. Each must contain valid layer configurations and they're +/// concatenated in the specified order. In most cases, something like the +/// following should do. +/// +/// $ scx_layered file:example.json +/// +/// Statistics +/// ========== +/// +/// scx_layered will print out a set of statistics every monitoring +/// interval. +/// +/// tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 tctx_err=9 proc=6ms +/// busy= 34.2 util= 1733.6 load= 21744.1 fallback_cpu= 1 +/// batch : util/frac= 11.8/ 0.7 load/frac= 29.7: 0.1 tasks= 2597 +/// tot= 3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 +/// cpus= 2 [ 2, 2] 04000001 00000000 +/// immediate: util/frac= 1218.8/ 70.3 load/frac= 21399.9: 98.4 tasks= 1107 +/// tot= 68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00 +/// cpus= 50 [ 50, 50] fbfffffe 000fffff +/// normal : util/frac= 502.9/ 29.0 load/frac= 314.5: 1.4 tasks= 3512 +/// tot= 45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56 +/// cpus= 50 [ 50, 50] fbfffffe 000fffff +/// +/// Global statistics: +/// +/// - tot: Total scheduling events in the period. +/// +/// - local: % that got scheduled directly into an idle CPU. +/// +/// - open_idle: % of open layer tasks scheduled into occupied idle CPUs. 
+/// +/// - affn_viol: % which violated configured policies due to CPU affinity +/// restrictions. +/// +/// - proc: CPU time this binary consumed during the period. +/// +/// - busy: CPU busy % (100% means all CPUs were fully occupied) +/// +/// - util: CPU utilization % (100% means one CPU was fully occupied) +/// +/// - load: Sum of weight * duty_cycle for all tasks +/// +/// Per-layer statistics: +/// +/// - util/frac: CPU utilization and fraction % (sum of fractions across +/// layers is always 100%). +/// +/// - load/frac: Load sum and fraction %. +/// +/// - tasks: Number of tasks. +/// +/// - tot: Total scheduling events. +/// +/// - open_idle: % of tasks scheduled into idle CPUs occupied by other layers. +/// +/// - preempt: % of tasks that preempted other tasks. +/// +/// - affn_viol: % which violated configured policies due to CPU affinity +/// restrictions. +/// +/// - cpus: CUR_NR_CPUS [MIN_NR_CPUS, MAX_NR_CPUS] CUR_CPU_MASK +/// +#[derive(Debug, Parser)] +#[command(verbatim_doc_comment)] +struct Opts { + /// Scheduling slice duration in microseconds. + #[clap(short = 's', long, default_value = "20000")] + slice_us: u64, + + /// Scheduling interval in seconds. + #[clap(short = 'i', long, default_value = "0.1")] + interval: f64, + + /// Monitoring interval in seconds. + #[clap(short = 'm', long, default_value = "2.0")] + monitor: f64, + + /// Disable load-fraction based max layer CPU limit. ***NOTE*** + /// load-fraction calculation is currently broken due to lack of + /// infeasible weight adjustments. Setting this option is recommended. + #[clap(short = 'n', long)] + no_load_frac_limit: bool, + + /// Enable verbose output including libbpf details. Specify multiple + /// times to increase verbosity. + #[clap(short = 'v', long, action = clap::ArgAction::Count)] + verbose: u8, + + /// Write example layer specifications into the file and exit. + #[clap(short = 'e', long)] + example: Option, + + /// Layer specification. See --help. 
+ specs: Vec, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +enum LayerMatch { + CgroupPrefix(String), + CommPrefix(String), + NiceAbove(i32), + NiceBelow(i32), +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +enum LayerKind { + Confined { + cpus_range: Option<(usize, usize)>, + util_range: (f64, f64), + }, + Grouped { + cpus_range: Option<(usize, usize)>, + util_range: (f64, f64), + preempt: bool, + }, + Open { + preempt: bool, + }, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +struct LayerSpec { + name: String, + comment: Option, + matches: Vec>, + kind: LayerKind, +} + +impl LayerSpec { + fn parse(input: &str) -> Result> { + let config: LayerConfig = if input.starts_with("f:") || input.starts_with("file:") { + let mut f = fs::OpenOptions::new() + .read(true) + .open(input.split_once(':').unwrap().1)?; + let mut content = String::new(); + f.read_to_string(&mut content)?; + serde_json::from_str(&content)? + } else { + serde_json::from_str(input)? + }; + Ok(config.specs) + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(transparent)] +struct LayerConfig { + specs: Vec, +} + +fn now_monotonic() -> u64 { + let mut time = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; + assert!(ret == 0); + time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 +} + +fn read_total_cpu(reader: &procfs::ProcReader) -> Result { + reader + .read_stat() + .context("Failed to read procfs")? + .total_cpu + .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) +} + +fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { + match (curr, prev) { + ( + procfs::CpuStat { + user_usec: Some(curr_user), + nice_usec: Some(curr_nice), + system_usec: Some(curr_system), + idle_usec: Some(curr_idle), + iowait_usec: Some(curr_iowait), + irq_usec: Some(curr_irq), + softirq_usec: Some(curr_softirq), + stolen_usec: Some(curr_stolen), + .. 
+ }, + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + .. + }, + ) => { + let idle_usec = curr_idle - prev_idle; + let iowait_usec = curr_iowait - prev_iowait; + let user_usec = curr_user - prev_user; + let system_usec = curr_system - prev_system; + let nice_usec = curr_nice - prev_nice; + let irq_usec = curr_irq - prev_irq; + let softirq_usec = curr_softirq - prev_softirq; + let stolen_usec = curr_stolen - prev_stolen; + + let busy_usec = + user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; + let total_usec = idle_usec + busy_usec + iowait_usec; + if total_usec > 0 { + Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) + } else { + Ok(1.0) + } + } + _ => { + bail!("Missing stats in cpustat"); + } + } +} + +fn copy_into_cstr(dst: &mut [i8], src: &str) { + let cstr = CString::new(src).unwrap(); + let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) }; + dst[0..bytes.len()].copy_from_slice(bytes); +} + +fn format_bitvec(bitvec: &BitVec) -> String { + let mut vals = Vec::::new(); + let mut val: u32 = 0; + for (idx, bit) in bitvec.iter().enumerate() { + if idx > 0 && idx % 32 == 0 { + vals.push(val); + val = 0; + } + if *bit { + val |= 1 << (idx % 32); + } + } + vals.push(val); + let mut output = vals + .iter() + .fold(String::new(), |string, v| format!("{}{:08x} ", string, v)); + output.pop(); + output +} + +fn read_cpu_ctxs(skel: &BpfSkel) -> Result> { + let mut cpu_ctxs = vec![]; + let cpu_ctxs_vec = skel + .maps() + .cpu_ctxs() + .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY) + .context("Failed to lookup cpu_ctx")? 
+ .unwrap(); + for cpu in 0..*NR_POSSIBLE_CPUS { + cpu_ctxs.push(*unsafe { + &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx) + }); + } + Ok(cpu_ctxs) +} + +#[derive(Clone, Debug)] +struct BpfStats { + gstats: Vec, + lstats: Vec>, + lstats_sums: Vec, +} + +impl BpfStats { + fn read(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Self { + let mut gstats = vec![0u64; NR_GSTATS]; + let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers]; + + for cpu in 0..*NR_POSSIBLE_CPUS { + for stat in 0..NR_GSTATS { + gstats[stat] += cpu_ctxs[cpu].gstats[stat]; + } + for layer in 0..nr_layers { + for stat in 0..NR_LSTATS { + lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat]; + } + } + } + + let mut lstats_sums = vec![0u64; NR_LSTATS]; + for layer in 0..nr_layers { + for stat in 0..NR_LSTATS { + lstats_sums[stat] += lstats[layer][stat]; + } + } + + Self { + gstats, + lstats, + lstats_sums, + } + } +} + +impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats { + type Output = BpfStats; + + fn sub(self, rhs: &'b BpfStats) -> BpfStats { + let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect(); + BpfStats { + gstats: vec_sub(&self.gstats, &rhs.gstats), + lstats: self + .lstats + .iter() + .zip(rhs.lstats.iter()) + .map(|(l, r)| vec_sub(l, r)) + .collect(), + lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums), + } + } +} + +struct Stats { + nr_layers: usize, + at: Instant, + + nr_layer_tasks: Vec, + + total_load: f64, + layer_loads: Vec, + + total_util: f64, // Running AVG of sum of layer_utils + layer_utils: Vec, + prev_layer_cycles: Vec, + + cpu_busy: f64, // Read from /proc, maybe higher than total_util + prev_total_cpu: procfs::CpuStat, + + bpf_stats: BpfStats, + prev_bpf_stats: BpfStats, +} + +impl Stats { + fn read_layer_loads(skel: &mut BpfSkel, nr_layers: usize) -> (f64, Vec) { + let now_mono = now_monotonic(); + let layer_loads: Vec = skel + .bss() + .layers + .iter() + .take(nr_layers) + .map(|layer| { + let 
rd = &layer.load_rd; + ravg_read( + rd.val, + rd.val_at, + rd.old, + rd.cur, + now_mono, + USAGE_HALF_LIFE, + RAVG_FRAC_BITS, + ) + }) + .collect(); + (layer_loads.iter().sum(), layer_loads) + } + + fn read_layer_cycles(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec { + let mut layer_cycles = vec![0u64; nr_layers]; + + for cpu in 0..*NR_POSSIBLE_CPUS { + for layer in 0..nr_layers { + layer_cycles[layer] += cpu_ctxs[cpu].layer_cycles[layer]; + } + } + + layer_cycles + } + + fn new(skel: &mut BpfSkel, proc_reader: &procfs::ProcReader) -> Result { + let nr_layers = skel.rodata().nr_layers as usize; + let bpf_stats = BpfStats::read(&read_cpu_ctxs(skel)?, nr_layers); + + Ok(Self { + at: Instant::now(), + nr_layers, + + nr_layer_tasks: vec![0; nr_layers], + + total_load: 0.0, + layer_loads: vec![0.0; nr_layers], + + total_util: 0.0, + layer_utils: vec![0.0; nr_layers], + prev_layer_cycles: vec![0; nr_layers], + + cpu_busy: 0.0, + prev_total_cpu: read_total_cpu(&proc_reader)?, + + bpf_stats: bpf_stats.clone(), + prev_bpf_stats: bpf_stats, + }) + } + + fn refresh( + &mut self, + skel: &mut BpfSkel, + proc_reader: &procfs::ProcReader, + now: Instant, + ) -> Result<()> { + let elapsed = now.duration_since(self.at).as_secs_f64() as f64; + let cpu_ctxs = read_cpu_ctxs(skel)?; + + let nr_layer_tasks: Vec = skel + .bss() + .layers + .iter() + .take(self.nr_layers) + .map(|layer| layer.nr_tasks as usize) + .collect(); + + let (total_load, layer_loads) = Self::read_layer_loads(skel, self.nr_layers); + + let cur_layer_cycles = Self::read_layer_cycles(&cpu_ctxs, self.nr_layers); + let cur_layer_utils: Vec = cur_layer_cycles + .iter() + .zip(self.prev_layer_cycles.iter()) + .map(|(cur, prev)| (cur - prev) as f64 / 1_000_000_000.0 / elapsed) + .collect(); + let layer_utils: Vec = cur_layer_utils + .iter() + .zip(self.layer_utils.iter()) + .map(|(cur, prev)| { + let decay = USAGE_DECAY.powf(elapsed); + prev * decay + cur * (1.0 - decay) + }) + .collect(); + + let 
cur_total_cpu = read_total_cpu(proc_reader)?; + let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?; + + let cur_bpf_stats = BpfStats::read(&cpu_ctxs, self.nr_layers); + let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats; + + *self = Self { + at: now, + nr_layers: self.nr_layers, + + nr_layer_tasks, + + total_load, + layer_loads, + + total_util: layer_utils.iter().sum(), + layer_utils: layer_utils.try_into().unwrap(), + prev_layer_cycles: cur_layer_cycles, + + cpu_busy, + prev_total_cpu: cur_total_cpu, + + bpf_stats, + prev_bpf_stats: cur_bpf_stats, + }; + Ok(()) + } +} + +#[derive(Debug, Default)] +struct UserExitInfo { + kind: i32, + reason: Option, + msg: Option, +} + +impl UserExitInfo { + fn read(bpf_uei: &bpf_bss_types::user_exit_info) -> Result { + let kind = unsafe { std::ptr::read_volatile(&bpf_uei.kind as *const _) }; + + let (reason, msg) = if kind != 0 { + ( + Some( + unsafe { CStr::from_ptr(bpf_uei.reason.as_ptr() as *const _) } + .to_str() + .context("Failed to convert reason to string")? + .to_string(), + ) + .filter(|s| !s.is_empty()), + Some( + unsafe { CStr::from_ptr(bpf_uei.msg.as_ptr() as *const _) } + .to_str() + .context("Failed to convert msg to string")? 
+ .to_string(), + ) + .filter(|s| !s.is_empty()), + ) + } else { + (None, None) + }; + + Ok(Self { kind, reason, msg }) + } + + fn exited(bpf_uei: &bpf_bss_types::user_exit_info) -> Result { + Ok(Self::read(bpf_uei)?.kind != 0) + } + + fn report(&self) -> Result<()> { + let why = match (&self.reason, &self.msg) { + (Some(reason), None) => format!("{}", reason), + (Some(reason), Some(msg)) => format!("{} ({})", reason, msg), + _ => "".into(), + }; + + match self.kind { + 0 => Ok(()), + etype => { + if etype != 64 { + bail!("EXIT: kind={} {}", etype, why); + } else { + info!("EXIT: {}", why); + Ok(()) + } + } + } + } +} + +#[derive(Debug)] +struct CpuPool { + nr_cores: usize, + nr_cpus: usize, + all_cpus: BitVec, + core_cpus: Vec, + cpu_core: Vec, + available_cores: BitVec, + first_cpu: usize, + fallback_cpu: usize, // next free or the first CPU if none is free +} + +impl CpuPool { + fn new() -> Result { + if *NR_POSSIBLE_CPUS > MAX_CPUS { + bail!( + "NR_POSSIBLE_CPUS {} > MAX_CPUS {}", + *NR_POSSIBLE_CPUS, + MAX_CPUS + ); + } + + let mut cpu_to_cache = vec![]; // (cpu_id, Option) + let mut cache_ids = BTreeSet::::new(); + let mut nr_offline = 0; + + // Build cpu -> cache ID mapping. + for cpu in 0..*NR_POSSIBLE_CPUS { + let path = format!( + "/sys/devices/system/cpu/cpu{}/cache/index{}/id", + cpu, CORE_CACHE_LEVEL + ); + let id = match std::fs::read_to_string(&path) { + Ok(val) => Some(val.trim().parse::().with_context(|| { + format!("Failed to parse {:?}'s content {:?}", &path, &val) + })?), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + nr_offline += 1; + None + } + Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), + }; + + cpu_to_cache.push(id); + if let Some(id) = id { + cache_ids.insert(id); + } + } + + let nr_cpus = *NR_POSSIBLE_CPUS - nr_offline; + + // Cache IDs may have holes. Assign consecutive core IDs to existing + // cache IDs. 
+ let mut cache_to_core = BTreeMap::::new(); + let mut nr_cores = 0; + for cache_id in cache_ids.iter() { + cache_to_core.insert(*cache_id, nr_cores); + nr_cores += 1; + } + + // Build core -> cpumask and cpu -> core mappings. + let mut all_cpus = bitvec![0; *NR_POSSIBLE_CPUS]; + let mut core_cpus = vec![bitvec![0; *NR_POSSIBLE_CPUS]; nr_cores]; + let mut cpu_core = vec![]; + + for (cpu, cache) in cpu_to_cache.iter().enumerate().take(*NR_POSSIBLE_CPUS) { + if let Some(cache_id) = cache { + let core_id = cache_to_core[cache_id]; + all_cpus.set(cpu, true); + core_cpus[core_id].set(cpu, true); + cpu_core.push(core_id); + } + } + + info!( + "CPUs: online/possible={}/{} nr_cores={}", + nr_cpus, *NR_POSSIBLE_CPUS, nr_cores, + ); + + let first_cpu = core_cpus[0].first_one().unwrap(); + + let mut cpu_pool = Self { + nr_cores, + nr_cpus, + all_cpus, + core_cpus, + cpu_core, + available_cores: bitvec![1; nr_cores], + first_cpu, + fallback_cpu: first_cpu, + }; + cpu_pool.update_fallback_cpu(); + Ok(cpu_pool) + } + + fn update_fallback_cpu(&mut self) { + match self.available_cores.first_one() { + Some(next) => self.fallback_cpu = self.core_cpus[next].first_one().unwrap(), + None => self.fallback_cpu = self.first_cpu, + } + } + + fn alloc<'a>(&'a mut self) -> Option<&'a BitVec> { + let core = self.available_cores.first_one()?; + self.available_cores.set(core, false); + self.update_fallback_cpu(); + Some(&self.core_cpus[core]) + } + + fn cpus_to_cores(&self, cpus_to_match: &BitVec) -> Result { + let mut cpus = cpus_to_match.clone(); + let mut cores = bitvec![0; self.nr_cores]; + + while let Some(cpu) = cpus.first_one() { + let core = self.cpu_core[cpu]; + + if (self.core_cpus[core].clone() & !cpus.clone()).count_ones() != 0 { + bail!( + "CPUs {} partially intersect with core {} ({})", + cpus_to_match, + core, + self.core_cpus[core], + ); + } + + cpus &= !self.core_cpus[core].clone(); + cores.set(core, true); + } + + Ok(cores) + } + + fn free<'a>(&'a mut self, cpus_to_free: 
&BitVec) -> Result<()> { + let cores = self.cpus_to_cores(cpus_to_free)?; + if (self.available_cores.clone() & &cores).any() { + bail!("Some of CPUs {} are already free", cpus_to_free); + } + self.available_cores |= cores; + self.update_fallback_cpu(); + Ok(()) + } + + fn next_to_free<'a>(&'a self, cands: &BitVec) -> Result> { + let last = match cands.last_one() { + Some(ret) => ret, + None => return Ok(None), + }; + let core = self.cpu_core[last]; + if (self.core_cpus[core].clone() & !cands.clone()).count_ones() != 0 { + bail!( + "CPUs{} partially intersect with core {} ({})", + cands, + core, + self.core_cpus[core] + ); + } + + Ok(Some(&self.core_cpus[core])) + } + + fn available_cpus(&self) -> BitVec { + let mut cpus = bitvec![0; self.nr_cpus]; + for core in self.available_cores.iter_ones() { + cpus |= &self.core_cpus[core]; + } + cpus + } +} + +#[derive(Debug)] +struct Layer { + name: String, + kind: LayerKind, + + nr_cpus: usize, + cpus: BitVec, +} + +impl Layer { + fn new(cpu_pool: &mut CpuPool, name: &str, kind: LayerKind) -> Result { + match &kind { + LayerKind::Confined { + cpus_range, + util_range, + } => { + let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); + if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 { + bail!("invalid cpus_range {:?}", cpus_range); + } + if util_range.0 < 0.0 + || util_range.0 > 1.0 + || util_range.1 < 0.0 + || util_range.1 > 1.0 + || util_range.0 >= util_range.1 + { + bail!("invalid util_range {:?}", util_range); + } + } + _ => {} + } + + let nr_cpus = cpu_pool.nr_cpus; + + let mut layer = Self { + name: name.into(), + kind, + + nr_cpus: 0, + cpus: bitvec![0; nr_cpus], + }; + + match &layer.kind { + LayerKind::Confined { + cpus_range, + util_range, + } + | LayerKind::Grouped { + cpus_range, + util_range, + .. 
+ } => { + layer.resize_confined_or_grouped( + cpu_pool, + *cpus_range, + *util_range, + (0.0, 0.0), + (0.0, 0.0), + false, + )?; + } + _ => {} + } + + Ok(layer) + } + + fn grow_confined_or_grouped( + &mut self, + cpu_pool: &mut CpuPool, + (cpus_min, cpus_max): (usize, usize), + (_util_low, util_high): (f64, f64), + (layer_load, total_load): (f64, f64), + (layer_util, _total_util): (f64, f64), + no_load_frac_limit: bool, + ) -> Result { + if self.nr_cpus >= cpus_max { + return Ok(false); + } + + // Do we already have enough? + if self.nr_cpus >= cpus_min + && (layer_util == 0.0 + || (self.nr_cpus > 0 && layer_util / self.nr_cpus as f64 <= util_high)) + { + return Ok(false); + } + + // Can't have more CPUs than our load fraction. + if !no_load_frac_limit + && self.nr_cpus >= cpus_min + && (total_load >= 0.0 + && self.nr_cpus as f64 / cpu_pool.nr_cpus as f64 >= layer_load / total_load) + { + trace!( + "layer-{} needs more CPUs (util={:.3}) but is over the load fraction", + &self.name, + layer_util + ); + return Ok(false); + } + + let new_cpus = match cpu_pool.alloc().clone() { + Some(ret) => ret.clone(), + None => { + trace!("layer-{} can't grow, no CPUs", &self.name); + return Ok(false); + } + }; + + trace!( + "layer-{} adding {} CPUs to {} CPUs", + &self.name, + new_cpus.count_ones(), + self.nr_cpus + ); + + self.nr_cpus += new_cpus.count_ones(); + self.cpus |= &new_cpus; + Ok(true) + } + + fn cpus_to_free( + &self, + cpu_pool: &mut CpuPool, + (cpus_min, _cpus_max): (usize, usize), + (util_low, util_high): (f64, f64), + (layer_load, total_load): (f64, f64), + (layer_util, _total_util): (f64, f64), + no_load_frac_limit: bool, + ) -> Result> { + if self.nr_cpus <= cpus_min { + return Ok(None); + } + + let cpus_to_free = match cpu_pool.next_to_free(&self.cpus)? { + Some(ret) => ret.clone(), + None => return Ok(None), + }; + + let nr_to_free = cpus_to_free.count_ones(); + + // If we'd be over the load fraction even after freeing + // $cpus_to_free, we have to free. 
+ if !no_load_frac_limit + && total_load >= 0.0 + && (self.nr_cpus - nr_to_free) as f64 / cpu_pool.nr_cpus as f64 + >= layer_load / total_load + { + return Ok(Some(cpus_to_free)); + } + + if layer_util / self.nr_cpus as f64 >= util_low { + return Ok(None); + } + + // Can't shrink if losing the CPUs pushes us over @util_high. + match self.nr_cpus - nr_to_free { + 0 => { + if layer_util > 0.0 { + return Ok(None); + } + } + nr_left => { + if layer_util / nr_left as f64 >= util_high { + return Ok(None); + } + } + } + + return Ok(Some(cpus_to_free)); + } + + fn shrink_confined_or_grouped( + &mut self, + cpu_pool: &mut CpuPool, + cpus_range: (usize, usize), + util_range: (f64, f64), + load: (f64, f64), + util: (f64, f64), + no_load_frac_limit: bool, + ) -> Result { + match self.cpus_to_free( + cpu_pool, + cpus_range, + util_range, + load, + util, + no_load_frac_limit, + )? { + Some(cpus_to_free) => { + trace!("freeing CPUs {}", &cpus_to_free); + self.nr_cpus -= cpus_to_free.count_ones(); + self.cpus &= !cpus_to_free.clone(); + cpu_pool.free(&cpus_to_free)?; + Ok(true) + } + None => Ok(false), + } + } + + fn resize_confined_or_grouped( + &mut self, + cpu_pool: &mut CpuPool, + cpus_range: Option<(usize, usize)>, + util_range: (f64, f64), + load: (f64, f64), + util: (f64, f64), + no_load_frac_limit: bool, + ) -> Result { + let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); + let mut adjusted = 0; + + while self.grow_confined_or_grouped( + cpu_pool, + cpus_range, + util_range, + load, + util, + no_load_frac_limit, + )? { + adjusted += 1; + trace!("{} grew, adjusted={}", &self.name, adjusted); + } + + if adjusted == 0 { + while self.shrink_confined_or_grouped( + cpu_pool, + cpus_range, + util_range, + load, + util, + no_load_frac_limit, + )? 
{ + adjusted -= 1; + trace!("{} shrunk, adjusted={}", &self.name, adjusted); + } + } + + if adjusted != 0 { + trace!("{} done resizing, adjusted={}", &self.name, adjusted); + } + Ok(adjusted) + } +} + +struct Scheduler<'a> { + skel: BpfSkel<'a>, + struct_ops: Option, + layer_specs: Vec, + + sched_intv: Duration, + monitor_intv: Duration, + no_load_frac_limit: bool, + + cpu_pool: CpuPool, + layers: Vec, + + proc_reader: procfs::ProcReader, + sched_stats: Stats, + report_stats: Stats, + + nr_layer_cpus_min_max: Vec<(usize, usize)>, + processing_dur: Duration, + prev_processing_dur: Duration, +} + +impl<'a> Scheduler<'a> { + fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec) -> Result<()> { + skel.rodata_mut().nr_layers = specs.len() as u32; + + for (spec_i, spec) in specs.iter().enumerate() { + let layer = &mut skel.bss_mut().layers[spec_i]; + + for (or_i, or) in spec.matches.iter().enumerate() { + for (and_i, and) in or.iter().enumerate() { + let mt = &mut layer.matches[or_i].matches[and_i]; + match and { + LayerMatch::CgroupPrefix(prefix) => { + mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32; + copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str()); + } + LayerMatch::CommPrefix(prefix) => { + mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32; + copy_into_cstr(&mut mt.comm_prefix, prefix.as_str()); + } + LayerMatch::NiceAbove(nice) => { + mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32; + mt.nice_above_or_below = *nice; + } + LayerMatch::NiceBelow(nice) => { + mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32; + mt.nice_above_or_below = *nice; + } + } + } + layer.matches[or_i].nr_match_ands = or.len() as i32; + } + + layer.nr_match_ors = spec.matches.len() as u32; + + match &spec.kind { + LayerKind::Open { preempt } | LayerKind::Grouped { preempt, .. 
} => { + layer.open = true; + layer.preempt = *preempt; + } + _ => {} + } + } + + Ok(()) + } + + fn init(opts: &Opts, layer_specs: Vec) -> Result { + let nr_layers = layer_specs.len(); + let mut cpu_pool = CpuPool::new()?; + + // Open the BPF prog first for verification. + let mut skel_builder = BpfSkelBuilder::default(); + skel_builder.obj_builder.debug(opts.verbose > 1); + let mut skel = skel_builder.open().context("Failed to open BPF program")?; + + // Initialize skel according to @opts. + skel.rodata_mut().debug = opts.verbose as u32; + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; + skel.rodata_mut().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; + for cpu in cpu_pool.all_cpus.iter_ones() { + skel.rodata_mut().all_cpus[cpu / 8] |= 1 << (cpu % 8); + } + Self::init_layers(&mut skel, &layer_specs)?; + + // Attach. + let mut skel = skel.load().context("Failed to load BPF program")?; + skel.attach().context("Failed to attach BPF program")?; + let struct_ops = Some( + skel.maps_mut() + .layered() + .attach_struct_ops() + .context("Failed to attach layered struct ops")?, + ); + info!("Layered Scheduler Attached"); + + let mut layers = vec![]; + for spec in layer_specs.iter() { + layers.push(Layer::new(&mut cpu_pool, &spec.name, spec.kind.clone())?); + } + + // Other stuff. 
+ let proc_reader = procfs::ProcReader::new(); + + Ok(Self { + struct_ops, // should be held to keep it attached + layer_specs, + + sched_intv: Duration::from_secs_f64(opts.interval), + monitor_intv: Duration::from_secs_f64(opts.monitor), + no_load_frac_limit: opts.no_load_frac_limit, + + cpu_pool, + layers, + + sched_stats: Stats::new(&mut skel, &proc_reader)?, + report_stats: Stats::new(&mut skel, &proc_reader)?, + + nr_layer_cpus_min_max: vec![(0, 0); nr_layers], + processing_dur: Duration::from_millis(0), + prev_processing_dur: Duration::from_millis(0), + + proc_reader, + skel, + }) + } + + fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut bpf_bss_types::layer) { + for bit in 0..layer.cpus.len() { + if layer.cpus[bit] { + bpf_layer.cpus[bit / 8] |= 1 << (bit % 8); + } else { + bpf_layer.cpus[bit / 8] &= !(1 << (bit % 8)); + } + } + bpf_layer.refresh_cpus = 1; + } + + fn step(&mut self) -> Result<()> { + let started_at = Instant::now(); + self.sched_stats + .refresh(&mut self.skel, &self.proc_reader, started_at)?; + let mut updated = false; + + for idx in 0..self.layers.len() { + match self.layers[idx].kind { + LayerKind::Confined { + cpus_range, + util_range, + } + | LayerKind::Grouped { + cpus_range, + util_range, + .. + } => { + let load = ( + self.sched_stats.layer_loads[idx], + self.sched_stats.total_load, + ); + let util = ( + self.sched_stats.layer_utils[idx], + self.sched_stats.total_util, + ); + if self.layers[idx].resize_confined_or_grouped( + &mut self.cpu_pool, + cpus_range, + util_range, + load, + util, + self.no_load_frac_limit, + )? 
!= 0 + { + Self::update_bpf_layer_cpumask( + &self.layers[idx], + &mut self.skel.bss_mut().layers[idx], + ); + updated = true; + } + } + _ => {} + } + } + + if updated { + let available_cpus = self.cpu_pool.available_cpus(); + let nr_available_cpus = available_cpus.count_ones(); + for idx in 0..self.layers.len() { + let layer = &mut self.layers[idx]; + let bpf_layer = &mut self.skel.bss_mut().layers[idx]; + match &layer.kind { + LayerKind::Open { .. } => { + layer.cpus.copy_from_bitslice(&available_cpus); + layer.nr_cpus = nr_available_cpus; + Self::update_bpf_layer_cpumask(layer, bpf_layer); + } + _ => {} + } + } + + self.skel.bss_mut().fallback_cpu = self.cpu_pool.fallback_cpu as u32; + + for (lidx, layer) in self.layers.iter().enumerate() { + self.nr_layer_cpus_min_max[lidx] = ( + self.nr_layer_cpus_min_max[lidx].0.min(layer.nr_cpus), + self.nr_layer_cpus_min_max[lidx].1.max(layer.nr_cpus), + ); + } + } + + self.processing_dur += Instant::now().duration_since(started_at); + Ok(()) + } + + fn report(&mut self) -> Result<()> { + let started_at = Instant::now(); + self.report_stats + .refresh(&mut self.skel, &self.proc_reader, started_at)?; + let stats = &self.report_stats; + + let processing_dur = self.processing_dur - self.prev_processing_dur; + self.prev_processing_dur = self.processing_dur; + + let lsum = |idx| stats.bpf_stats.lstats_sums[idx as usize]; + let total = lsum(bpf_intf::layer_stat_idx_LSTAT_LOCAL) + + lsum(bpf_intf::layer_stat_idx_LSTAT_GLOBAL); + let lsum_pct = |idx| { + if total != 0 { + lsum(idx) as f64 / total as f64 * 100.0 + } else { + 0.0 + } + }; + + info!( + "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", + total, + lsum_pct(bpf_intf::layer_stat_idx_LSTAT_LOCAL), + lsum_pct(bpf_intf::layer_stat_idx_LSTAT_OPEN_IDLE), + lsum_pct(bpf_intf::layer_stat_idx_LSTAT_AFFN_VIOL), + stats.prev_bpf_stats.gstats + [bpf_intf::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], + processing_dur.as_millis(), + ); + + 
info!( + "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", + stats.cpu_busy * 100.0, + stats.total_util * 100.0, + stats.total_load, + self.cpu_pool.fallback_cpu, + ); + + let header_width = self + .layer_specs + .iter() + .map(|spec| spec.name.len()) + .max() + .unwrap() + .max(4); + + let calc_frac = |a, b| { + if b != 0.0 { a / b * 100.0 } else { 0.0 } + }; + + for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() { + let lstat = |sidx| stats.bpf_stats.lstats[lidx][sidx as usize]; + let ltotal = lstat(bpf_intf::layer_stat_idx_LSTAT_LOCAL) + + lstat(bpf_intf::layer_stat_idx_LSTAT_GLOBAL); + let lstat_pct = |sidx| { + if ltotal != 0 { + lstat(sidx) as f64 / ltotal as f64 * 100.0 + } else { + 0.0 + } + }; + + info!( + " {:) -> Result<()> { + let now = Instant::now(); + let mut next_sched_at = now + self.sched_intv; + let mut next_monitor_at = now + self.monitor_intv; + + while !shutdown.load(Ordering::Relaxed) && !UserExitInfo::exited(&self.skel.bss().uei)? 
{ + let now = Instant::now(); + + if now >= next_sched_at { + self.step()?; + while next_sched_at < now { + next_sched_at += self.sched_intv; + } + } + + if now >= next_monitor_at { + self.report()?; + while next_monitor_at < now { + next_monitor_at += self.monitor_intv; + } + } + + std::thread::sleep( + next_sched_at + .min(next_monitor_at) + .duration_since(Instant::now()), + ); + } + + self.struct_ops.take(); + UserExitInfo::read(&self.skel.bss().uei)?.report() + } +} + +impl<'a> Drop for Scheduler<'a> { + fn drop(&mut self) { + if let Some(struct_ops) = self.struct_ops.take() { + drop(struct_ops); + } + } +} + +fn write_example_file(path: &str) -> Result<()> { + let example = LayerConfig { + specs: vec![ + LayerSpec { + name: "batch".into(), + comment: Some("tasks under system.slice or tasks with nice value > 0".into()), + matches: vec![ + vec![LayerMatch::CgroupPrefix("system.slice/".into())], + vec![LayerMatch::NiceAbove(0)], + ], + kind: LayerKind::Confined { + cpus_range: Some((0, 16)), + util_range: (0.8, 0.9), + }, + }, + LayerSpec { + name: "immediate".into(), + comment: Some("tasks under workload.slice with nice value < 0".into()), + matches: vec![vec![ + LayerMatch::CgroupPrefix("workload.slice/".into()), + LayerMatch::NiceBelow(0), + ]], + kind: LayerKind::Open { preempt: true }, + }, + LayerSpec { + name: "normal".into(), + comment: Some("the rest".into()), + matches: vec![vec![]], + kind: LayerKind::Grouped { + cpus_range: None, + util_range: (0.5, 0.6), + preempt: false, + }, + }, + ], + }; + + let mut f = fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(path)?; + Ok(f.write_all(serde_json::to_string_pretty(&example)?.as_bytes())?) 
+} + +fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> { + let nr_specs = specs.len(); + if nr_specs == 0 { + bail!("No layer spec"); + } + if nr_specs > MAX_LAYERS { + bail!("Too many layer specs"); + } + + for (idx, spec) in specs.iter().enumerate() { + if idx < nr_specs - 1 { + if spec.matches.len() == 0 { + bail!("Non-terminal spec {:?} has NULL matches", spec.name); + } + } else { + if spec.matches.len() != 1 || spec.matches[0].len() != 0 { + bail!("Terminal spec {:?} must have an empty match", spec.name); + } + } + + if spec.matches.len() > MAX_LAYER_MATCH_ORS { + bail!( + "Spec {:?} has too many ({}) OR match blocks", + spec.name, + spec.matches.len() + ); + } + + for (ands_idx, ands) in spec.matches.iter().enumerate() { + if ands.len() > NR_LAYER_MATCH_KINDS { + bail!( + "Spec {:?}'s {}th OR block has too many ({}) match conditions", + spec.name, + ands_idx, + ands.len() + ); + } + for one in ands.iter() { + match one { + LayerMatch::CgroupPrefix(prefix) => { + if prefix.len() > MAX_PATH { + bail!("Spec {:?} has too long a cgroup prefix", spec.name); + } + } + LayerMatch::CommPrefix(prefix) => { + if prefix.len() > MAX_COMM { + bail!("Spec {:?} has too long a comm prefix", spec.name); + } + } + _ => {} + } + } + } + + match spec.kind { + LayerKind::Confined { + cpus_range, + util_range, + } + | LayerKind::Grouped { + cpus_range, + util_range, + .. 
+ } => { + if let Some((cpus_min, cpus_max)) = cpus_range { + if cpus_min > cpus_max { + bail!( + "Spec {:?} has invalid cpus_range({}, {})", + spec.name, + cpus_min, + cpus_max + ); + } + } + if util_range.0 >= util_range.1 { + bail!( + "Spec {:?} has invalid util_range ({}, {})", + spec.name, + util_range.0, + util_range.1 + ); + } + } + _ => {} + } + } + + Ok(()) +} + +fn main() -> Result<()> { + let opts = Opts::parse(); + + let llv = match opts.verbose { + 0 => simplelog::LevelFilter::Info, + 1 => simplelog::LevelFilter::Debug, + _ => simplelog::LevelFilter::Trace, + }; + let mut lcfg = simplelog::ConfigBuilder::new(); + lcfg.set_time_level(simplelog::LevelFilter::Error) + .set_location_level(simplelog::LevelFilter::Off) + .set_target_level(simplelog::LevelFilter::Off) + .set_thread_level(simplelog::LevelFilter::Off); + simplelog::TermLogger::init( + llv, + lcfg.build(), + simplelog::TerminalMode::Stderr, + simplelog::ColorChoice::Auto, + )?; + + debug!("opts={:?}", &opts); + + if let Some(path) = &opts.example { + write_example_file(path)?; + return Ok(()); + } + + let mut layer_config = LayerConfig { specs: vec![] }; + for (idx, input) in opts.specs.iter().enumerate() { + layer_config.specs.append( + &mut LayerSpec::parse(input) + .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?, + ); + } + + debug!("specs={}", serde_json::to_string_pretty(&layer_config)?); + verify_layer_specs(&layer_config.specs)?; + + let mut sched = Scheduler::init(&opts, layer_config.specs)?; + + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_clone = shutdown.clone(); + ctrlc::set_handler(move || { + shutdown_clone.store(true, Ordering::Relaxed); + }) + .context("Error setting Ctrl-C handler")?; + + sched.run(shutdown) +} diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c new file mode 100644 index 0000000000000..9da53c4b3e634 --- /dev/null +++ b/tools/sched_ext/scx_pair.bpf.c @@ -0,0 +1,626 @@ +/* SPDX-License-Identifier: 
GPL-2.0 */ +/* + * A demo sched_ext core-scheduler which always makes every sibling CPU pair + * execute from the same CPU cgroup. + * + * This scheduler is a minimal implementation and would need some form of + * priority handling both inside each cgroup and across the cgroups to be + * practically useful. + * + * Each CPU in the system is paired with exactly one other CPU, according to a + * "stride" value that can be specified when the BPF scheduler program is first + * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee + * that they will only ever schedule tasks that belong to the same CPU cgroup. + * + * Scheduler Initialization + * ------------------------ + * + * The scheduler BPF program is first initialized from user space, before it is + * enabled. During this initialization process, each CPU on the system is + * assigned several values that are constant throughout its runtime: + * + * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling + * decisions. Paired CPUs always schedule tasks from the same + * CPU cgroup, and synchronize with each other to guarantee + * that this constraint is not violated. + * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access + * a struct pair_ctx object that is shared between the pair. + * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the + * pair. Each struct pair_ctx has an active_mask field, + * which is a bitmap used to indicate whether each core + * in the pair currently has an actively running task. + * This index specifies which entry in the bitmap corresponds + * to each CPU in the pair. + * + * During this initialization, the CPUs are paired according to a "stride" that + * may be specified when invoking the user space program that initializes and + * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. 
+ * + * Tasks and cgroups + * ----------------- + * + * Every cgroup in the system is registered with the scheduler using the + * pair_cgroup_init() callback, and every task in the system is associated with + * exactly one cgroup. At a high level, the idea with the pair scheduler is to + * always schedule tasks from the same cgroup within a given CPU pair. When a + * task is enqueued (i.e. passed to the pair_enqueue() callback function), its + * cgroup ID is read from its task struct, and then a corresponding queue map + * is used to FIFO-enqueue the task for that cgroup. + * + * If you look through the implementation of the scheduler, you'll notice that + * there is quite a bit of complexity involved with looking up the per-cgroup + * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash + * BPF hash map that is used to map a cgroup ID to a globally unique ID that's + * allocated in the BPF program. This is done because we use separate maps to + * store the FIFO queue of tasks, and the length of that map, per cgroup. This + * complexity is only present because of current deficiencies in BPF that will + * soon be addressed. The main point to keep in mind is that newly enqueued + * tasks are added to their cgroup's FIFO queue. + * + * Dispatching tasks + * ----------------- + * + * This section will describe how enqueued tasks are dispatched and scheduled. + * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is + * as follows: + * + * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is + * the structure that's used to synchronize amongst the two pair CPUs in their + * scheduling decisions. 
After any of the following events have occurred: + * + * - The cgroup's slice run has expired, or + * - The cgroup becomes empty, or + * - Either CPU in the pair is preempted by a higher priority scheduling class + * + * The cgroup transitions to the draining state and stops executing new tasks + * from the cgroup. + * + * 2. If the pair is still executing a task, mark the pair_ctx as draining, and + * wait for the pair CPU to be preempted. + * + * 3. Otherwise, if the pair CPU is not running a task, we can move on to + * scheduling new tasks. Pop the next cgroup id from the top_q queue. + * + * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. + * + * Note again that this scheduling behavior is simple, but the implementation + * is complex mostly because it hits several BPF shortcomings and has to work + * around them in often awkward ways. Most of the shortcomings are expected to + * be resolved in the near future which should allow greatly simplifying this + * scheduler. + * + * Dealing with preemption + * ----------------------- + * + * SCX is the lowest priority sched_class, and higher classes can preempt it at + * any time. To address this, the scheduler implements pair_cpu_release() and + * pair_cpu_acquire() callbacks which are invoked by the core scheduler when + * the scheduler loses and gains control of the CPU respectively. + * + * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and + * then invoke: + * + * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); + * + * This preempts the pair CPU, and waits until it has re-entered the scheduler + * before returning. This is necessary to ensure that the higher priority + * sched_class that preempted our scheduler does not schedule a task + * concurrently with our pair CPU. + * + * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption + * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable + * pair scheduling. 
+ * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_pair.h" + +char _license[] SEC("license") = "GPL"; + +const volatile bool switch_partial; + +/* !0 for veristat, set during init */ +const volatile u32 nr_cpu_ids = 1; + +/* a pair of CPUs stay on a cgroup for this duration */ +const volatile u32 pair_batch_dur_ns = SCX_SLICE_DFL; + +/* cpu ID -> pair cpu ID */ +const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); + +/* cpu ID -> pair_id */ +const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); + +/* CPU ID -> CPU # in the pair (0 or 1) */ +const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); + +struct pair_ctx { + struct bpf_spin_lock lock; + + /* the cgroup the pair is currently executing */ + u64 cgid; + + /* the pair started executing the current cgroup at */ + u64 started_at; + + /* whether the current cgroup is draining */ + bool draining; + + /* the CPUs that are currently active on the cgroup */ + u32 active_mask; + + /* + * the CPUs that are currently preempted and running tasks in a + * different scheduler. + */ + u32 preempted_mask; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct pair_ctx); +} pair_ctx SEC(".maps"); + +/* queue of cgrp_q's possibly with tasks on them */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + /* + * Because it's difficult to build strong synchronization encompassing + * multiple non-trivial operations in BPF, this queue is managed in an + * opportunistic way so that we guarantee that a cgroup w/ active tasks + * is always on it but possibly multiple times. Once we have more robust + * synchronization constructs and e.g. linked list, we should be able to + * do this in a prettier way but for now just size it big enough. 
+ */ + __uint(max_entries, 4 * MAX_CGRPS); + __type(value, u64); +} top_q SEC(".maps"); + +/* per-cgroup q which FIFOs the tasks from the cgroup */ +struct cgrp_q { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_QUEUED); + __type(value, u32); +}; + +/* + * Ideally, we want to allocate cgrp_q and cgrp_q_len in the cgroup local + * storage; however, a cgroup local storage can only be accessed from the BPF + * progs attached to the cgroup. For now, work around by allocating array of + * cgrp_q's and then allocating per-cgroup indices. + * + * Another caveat: It's difficult to populate a large array of maps statically + * or from BPF. Initialize it from userland. + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, MAX_CGRPS); + __type(key, s32); + __array(values, struct cgrp_q); +} cgrp_q_arr SEC(".maps"); + +static u64 cgrp_q_len[MAX_CGRPS]; + +/* + * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be + * useful to have as a map type. + */ +static u32 cgrp_q_idx_cursor; +static u64 cgrp_q_idx_busy[MAX_CGRPS]; + +/* + * All added up, the following is what we do: + * + * 1. When a cgroup is enabled, RR cgrp_q_idx_busy array doing cmpxchg looking + * for a free ID. If not found, fail cgroup creation with -EBUSY. + * + * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following + * cgrp_q_idx_hash. + * + * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from + * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. + * + * This is sadly complicated for something pretty simple. Hopefully, we should + * be able to simplify in the future. 
+ */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_CGRPS); + __uint(key_size, sizeof(u64)); /* cgrp ID */ + __uint(value_size, sizeof(s32)); /* cgrp_q idx */ +} cgrp_q_idx_hash SEC(".maps"); + +/* statistics */ +u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; +u64 nr_exps, nr_exp_waits, nr_exp_empty; +u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; + +struct user_exit_info uei; + +static bool time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct cgroup *cgrp; + struct cgrp_q *cgq; + s32 pid = p->pid; + u64 cgid; + u32 *q_idx; + u64 *cgq_len; + + __sync_fetch_and_add(&nr_total, 1); + + cgrp = scx_bpf_task_cgroup(p); + cgid = cgrp->kn->id; + bpf_cgroup_release(cgrp); + + /* find the cgroup's q and push @p into it */ + q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); + if (!q_idx) { + scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); + return; + } + + cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); + if (!cgq) { + scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", + cgid, *q_idx); + return; + } + + if (bpf_map_push_elem(cgq, &pid, 0)) { + scx_bpf_error("cgroup[%llu] queue overflow", cgid); + return; + } + + /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ + cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); + if (!cgq_len) { + scx_bpf_error("MEMBER_VTPR malfunction"); + return; + } + + if (!__sync_fetch_and_add(cgq_len, 1) && + bpf_map_push_elem(&top_q, &cgid, 0)) { + scx_bpf_error("top_q overflow"); + return; + } +} + +static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) +{ + u32 *vptr; + + vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); + if (!(*pairc)) + return -EINVAL; + + vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); + if (!vptr) + return -EINVAL; + + *mask = 1U 
<< *vptr;
+
+	return 0;
+}
+
+/*
+ * Try to dispatch one task for @cpu from the cgroup assigned to its CPU pair.
+ *
+ * @cpu's bit is first cleared from the pair's active_mask. If the current
+ * batch has expired (started_at + pair_batch_dur_ns is before now) or the pair
+ * is already draining, this either waits for the pair CPU (optionally kicking
+ * it) or picks the next non-empty cgroup off top_q. One task is then claimed
+ * by decrementing cgrp_q_len[q_idx] with a compare-and-swap, its PID is popped
+ * from the cgroup's queue and the task is dispatched to SCX_DSQ_GLOBAL.
+ *
+ * Returns 0 when there is nothing more to do for this call (a task was
+ * dispatched, top_q was empty, or we are waiting on the pair CPU), -EAGAIN
+ * when the caller should retry (the cgroup turned out empty or the popped PID
+ * no longer resolves to a task) and -ENOENT after a failure that has been
+ * reported with scx_bpf_error().
+ */
+static int try_dispatch(s32 cpu)
+{
+	struct pair_ctx *pairc;
+	struct bpf_map *cgq_map;
+	struct task_struct *p;
+	u64 now = bpf_ktime_get_ns();
+	bool kick_pair = false;
+	bool expired, pair_preempted;
+	u32 *vptr, in_pair_mask;
+	s32 pid, q_idx;
+	u64 cgid;
+	int ret;
+
+	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
+	if (ret) {
+		scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]",
+			      cpu);
+		return -ENOENT;
+	}
+
+	bpf_spin_lock(&pairc->lock);
+	pairc->active_mask &= ~in_pair_mask;
+
+	expired = time_before(pairc->started_at + pair_batch_dur_ns, now);
+	if (expired || pairc->draining) {
+		u64 new_cgid = 0;
+
+		__sync_fetch_and_add(&nr_exps, 1);
+
+		/*
+		 * We're done with the current cgid. An obvious optimization
+		 * would be not draining if the next cgroup is the current one.
+		 * For now, be dumb and always expire.
+		 */
+		pairc->draining = true;
+
+		pair_preempted = pairc->preempted_mask;
+		if (pairc->active_mask || pair_preempted) {
+			/*
+			 * The other CPU is still active, or is no longer under
+			 * our control due to e.g. being preempted by a higher
+			 * priority sched_class. We want to wait until this
+			 * cgroup expires, or until control of our pair CPU has
+			 * been returned to us.
+			 *
+			 * If the pair controls its CPU, and the time already
+			 * expired, kick. When the other CPU arrives at
+			 * dispatch and clears its active mask, it'll push the
+			 * pair to the next cgroup and kick this CPU.
+			 */
+			__sync_fetch_and_add(&nr_exp_waits, 1);
+			bpf_spin_unlock(&pairc->lock);
+			if (expired && !pair_preempted)
+				kick_pair = true;
+			goto out_maybe_kick;
+		}
+
+		bpf_spin_unlock(&pairc->lock);
+
+		/*
+		 * Pick the next cgroup. It'd be easier / cleaner to not drop
+		 * pairc->lock and use stronger synchronization here especially
+		 * given that we'll be switching cgroups significantly less
+		 * frequently than tasks. Unfortunately, bpf_spin_lock can't
+		 * really protect anything non-trivial. Let's do opportunistic
+		 * operations instead.
+		 */
+		bpf_repeat(BPF_MAX_LOOPS) {
+			u32 *q_idx;
+			u64 *cgq_len;
+
+			if (bpf_map_pop_elem(&top_q, &new_cgid)) {
+				/* no active cgroup, go idle */
+				__sync_fetch_and_add(&nr_exp_empty, 1);
+				return 0;
+			}
+
+			q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid);
+			if (!q_idx)
+				continue;
+
+			/*
+			 * This is the only place where empty cgroups are taken
+			 * off the top_q.
+			 */
+			cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);
+			if (!cgq_len || !*cgq_len)
+				continue;
+
+			/*
+			 * If it has any tasks, requeue as we may race and not
+			 * execute it.
+			 */
+			bpf_map_push_elem(&top_q, &new_cgid, 0);
+			break;
+		}
+
+		bpf_spin_lock(&pairc->lock);
+
+		/*
+		 * The other CPU may already have started on a new cgroup while
+		 * we dropped the lock. Make sure that we're still draining and
+		 * start on the new cgroup.
+		 */
+		if (pairc->draining && !pairc->active_mask) {
+			__sync_fetch_and_add(&nr_cgrp_next, 1);
+			pairc->cgid = new_cgid;
+			pairc->started_at = now;
+			pairc->draining = false;
+			kick_pair = true;
+		} else {
+			__sync_fetch_and_add(&nr_cgrp_coll, 1);
+		}
+	}
+
+	/* pairc->lock is held on both paths reaching this point */
+	cgid = pairc->cgid;
+	pairc->active_mask |= in_pair_mask;
+	bpf_spin_unlock(&pairc->lock);
+
+	/* again, it'd be better to do all these with the lock held, oh well */
+	vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
+	if (!vptr) {
+		scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);
+		return -ENOENT;
+	}
+	q_idx = *vptr;
+
+	/* claim one task from cgrp_q w/ q_idx */
+	bpf_repeat(BPF_MAX_LOOPS) {
+		u64 *cgq_len, len;
+
+		cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]);
+		if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) {
+			/* the cgroup must be empty, expire and repeat */
+			__sync_fetch_and_add(&nr_cgrp_empty, 1);
+			bpf_spin_lock(&pairc->lock);
+			pairc->draining = true;
+			pairc->active_mask &= ~in_pair_mask;
+			bpf_spin_unlock(&pairc->lock);
+			return -EAGAIN;
+		}
+
+		/* lost the race against another updater of the length, retry */
+		if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len)
+			continue;
+
+		break;
+	}
+
+	cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx);
+	if (!cgq_map) {
+		scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]",
+			      cgid, q_idx);
+		return -ENOENT;
+	}
+
+	if (bpf_map_pop_elem(cgq_map, &pid)) {
+		scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]",
+			      cgid, q_idx);
+		return -ENOENT;
+	}
+
+	p = bpf_task_from_pid(pid);
+	if (p) {
+		__sync_fetch_and_add(&nr_dispatched, 1);
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	} else {
+		/* we don't handle dequeues, retry on lost tasks */
+		__sync_fetch_and_add(&nr_missing, 1);
+		return -EAGAIN;
+	}
+
+out_maybe_kick:
+	if (kick_pair) {
+		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
+		if (pair) {
+			__sync_fetch_and_add(&nr_kicks, 1);
+			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
+		}
+	}
+	return 0;
+}
+
+/*
+ * ops.dispatch(): call try_dispatch() for @cpu until it returns anything other
+ * than -EAGAIN; the number of attempts is bounded by bpf_repeat(BPF_MAX_LOOPS).
+ * @prev is not used.
+ */
+void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev)
+{
+	bpf_repeat(BPF_MAX_LOOPS) {
+		if (try_dispatch(cpu) != -EAGAIN)
+			break;
+	}
+}
+
+/*
+ * ops.cpu_acquire(): clear @cpu's bit from the pair's preempted_mask and, if
+ * no bit is left set in it, kick the pair CPU with SCX_KICK_PREEMPT. Returns
+ * silently if the pair context cannot be looked up. @args is not used.
+ */
+void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args)
+{
+	int ret;
+	u32 in_pair_mask;
+	struct pair_ctx *pairc;
+	bool kick_pair;
+
+	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
+	if (ret)
+		return;
+
+	bpf_spin_lock(&pairc->lock);
+	pairc->preempted_mask &= ~in_pair_mask;
+	/* Kick the pair CPU, unless it was also preempted.
*/
+	kick_pair = !pairc->preempted_mask;
+	bpf_spin_unlock(&pairc->lock);
+
+	if (kick_pair) {
+		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
+
+		if (pair) {
+			__sync_fetch_and_add(&nr_kicks, 1);
+			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
+		}
+	}
+}
+
+/*
+ * ops.cpu_release(): record that @cpu is no longer ours.
+ *
+ * Under pairc->lock, @cpu's bit is set in preempted_mask and cleared from
+ * active_mask, and the pair is marked draining. If the pair CPU's bit is
+ * still set in active_mask, it is kicked with SCX_KICK_PREEMPT |
+ * SCX_KICK_WAIT. nr_preemptions is bumped once the pair context has been
+ * found. @args is not used.
+ */
+void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+	int ret;
+	u32 in_pair_mask;
+	struct pair_ctx *pairc;
+	bool kick_pair;
+
+	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
+	if (ret)
+		return;
+
+	bpf_spin_lock(&pairc->lock);
+	pairc->preempted_mask |= in_pair_mask;
+	pairc->active_mask &= ~in_pair_mask;
+	/* Kick the pair CPU if it's still running. */
+	kick_pair = pairc->active_mask;
+	pairc->draining = true;
+	bpf_spin_unlock(&pairc->lock);
+
+	if (kick_pair) {
+		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
+
+		if (pair) {
+			__sync_fetch_and_add(&nr_kicks, 1);
+			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
+		}
+	}
+	__sync_fetch_and_add(&nr_preemptions, 1);
+}
+
+/*
+ * ops.cgroup_init(): reserve a queue index for @cgrp.
+ *
+ * Up to MAX_CGRPS slots are probed, starting from the shared
+ * cgrp_q_idx_cursor; the first one whose cgrp_q_idx_busy entry can be flipped
+ * from 0 to 1 is claimed and recorded in cgrp_q_idx_hash under the cgroup ID.
+ * Returns 0 on success, or -EBUSY if no free slot was found or the hash update
+ * failed (the claimed slot is released again in the latter case).
+ */
+s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp)
+{
+	u64 cgid = cgrp->kn->id;
+	s32 i, q_idx;
+
+	bpf_for(i, 0, MAX_CGRPS) {
+		q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS;
+		if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1))
+			break;
+	}
+	/* the loop ran to completion without claiming a slot */
+	if (i == MAX_CGRPS)
+		return -EBUSY;
+
+	if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) {
+		u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]);
+		if (busy)
+			*busy = 0;
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/*
+ * ops.cgroup_exit(): if @cgrp has an entry in cgrp_q_idx_hash, mark its queue
+ * index as free in cgrp_q_idx_busy and delete the hash entry.
+ */
+void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp)
+{
+	u64 cgid = cgrp->kn->id;
+	s32 *q_idx;
+
+	q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
+	if (q_idx) {
+		u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]);
+		if (busy)
+			*busy = 0;
+		bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid);
+	}
+}
+
+/*
+ * ops.init(): call scx_bpf_switch_all() unless switch_partial is set.
+ * Always returns 0.
+ */
+s32 BPF_STRUCT_OPS(pair_init)
+{
+	if (!switch_partial)
+		scx_bpf_switch_all();
+	return 0;
+}
+
+void BPF_STRUCT_OPS(pair_exit, struct
scx_exit_info *ei)
+{
+	uei_record(&uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops pair_ops = {
+	.enqueue		= (void *)pair_enqueue,
+	.dispatch		= (void *)pair_dispatch,
+	.cpu_acquire		= (void *)pair_cpu_acquire,
+	.cpu_release		= (void *)pair_cpu_release,
+	.cgroup_init		= (void *)pair_cgroup_init,
+	.cgroup_exit		= (void *)pair_cgroup_exit,
+	.init			= (void *)pair_init,
+	.exit			= (void *)pair_exit,
+	.name			= "pair",
+};
diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c
new file mode 100644
index 0000000000000..1eb30efeb0ed5
--- /dev/null
+++ b/tools/sched_ext/scx_pair.c
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "scx_pair.h"
+#include "scx_pair.bpf.skel.h"
+
+const char help_fmt[] =
+"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n"
+"execute from the same CPU cgroup.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-S STRIDE] [-p]\n"
+"\n"
+" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n"
+" -p Switch only tasks on SCHED_EXT policy instead of all\n"
+" -h Display this help and exit\n";
+
+/* Set from the signal handler; polled by the loops in main(). */
+static volatile int exit_req;
+
+/* SIGINT / SIGTERM handler: only requests that main() stop. */
+static void sigint_handler(int dummy)
+{
+	exit_req = 1;
+}
+
+/*
+ * Load and run the pair scheduler.
+ *
+ * Opens the skeleton, computes the CPU pairing from the stride, fills the
+ * pair_cpu / pair_id / in_pair_idx rodata arrays, loads the program, creates
+ * one BPF_MAP_TYPE_QUEUE per cgroup slot inside cgrp_q_arr, attaches the
+ * struct_ops and then prints the counters once a second until a signal is
+ * received or the BPF side reports an exit.
+ *
+ * Returns 0 on a normal run and after -h, and 1 on an unknown option.
+ * Setup failures are handled by SCX_BUG_ON().
+ */
+int main(int argc, char **argv)
+{
+	struct scx_pair *skel;
+	struct bpf_link *link;
+	__u64 seq = 0;
+	__s32 stride, i, opt, outer_fd;
+
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+	skel = scx_pair__open();
+	SCX_BUG_ON(!skel, "Failed to open skel");
+
+	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
+
+	/* pair up the earlier half to the latter by default, override with -S */
+	stride = skel->rodata->nr_cpu_ids / 2;
+
+	while ((opt = getopt(argc, argv, "S:ph")) != -1) {
+		switch (opt) {
+		case 'S': {
+			char *end;
+			long val = strtol(optarg, &end, 0);
+
+			/*
+			 * Reject garbage, non-positive and too-large values
+			 * here rather than truncating them into @stride and
+			 * failing later with a confusing pairing error.
+			 */
+			SCX_BUG_ON(end == optarg || *end != '\0' || val <= 0 ||
+				   val >= (long)skel->rodata->nr_cpu_ids,
+				   "Invalid stride '%s' - must be between 1 and %ld",
+				   optarg, (long)skel->rodata->nr_cpu_ids - 1);
+			stride = val;
+			break;
+		}
+		case 'p':
+			skel->rodata->switch_partial = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2);
+
+	/* Resize arrays so their element count is equal to cpu count. */
+	RESIZE_ARRAY(rodata, pair_cpu, skel->rodata->nr_cpu_ids);
+	RESIZE_ARRAY(rodata, pair_id, skel->rodata->nr_cpu_ids);
+	RESIZE_ARRAY(rodata, in_pair_idx, skel->rodata->nr_cpu_ids);
+
+	/* -1 marks a CPU that has not been assigned to a pair yet */
+	for (i = 0; i < skel->rodata->nr_cpu_ids; i++)
+		skel->rodata_pair_cpu->pair_cpu[i] = -1;
+
+	printf("Pairs: ");
+	for (i = 0; i < skel->rodata->nr_cpu_ids; i++) {
+		int j = (i + stride) % skel->rodata->nr_cpu_ids;
+
+		if (skel->rodata_pair_cpu->pair_cpu[i] >= 0)
+			continue;
+
+		SCX_BUG_ON(i == j,
+			   "Invalid stride %d - CPU%d wants to be its own pair",
+			   stride, i);
+
+		SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0,
+			   "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair",
+			   stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]);
+
+		/* the lower CPU number names the pair and takes index 0 in it */
+		skel->rodata_pair_cpu->pair_cpu[i] = j;
+		skel->rodata_pair_cpu->pair_cpu[j] = i;
+		skel->rodata_pair_id->pair_id[i] = i;
+		skel->rodata_pair_id->pair_id[j] = i;
+		skel->rodata_in_pair_idx->in_pair_idx[i] = 0;
+		skel->rodata_in_pair_idx->in_pair_idx[j] = 1;
+
+		printf("[%d, %d] ", i, j);
+	}
+	printf("\n");
+
+	SCX_BUG_ON(scx_pair__load(skel), "Failed to load skel");
+
+	/*
+	 * Populate the cgrp_q_arr map which is an array containing per-cgroup
+	 * queues. It'd probably be better to do this from BPF but there are too
+	 * many to initialize statically and there's no way to dynamically
+	 * populate from BPF.
+	 */
+	outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr);
+	SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd);
+
+	printf("Initializing");
+	for (i = 0; i < MAX_CGRPS; i++) {
+		__s32 inner_fd;
+
+		if (exit_req)
+			break;
+
+		inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0,
+					  sizeof(__u32), MAX_QUEUED, NULL);
+		SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d",
+			   inner_fd);
+		SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY),
+			   "Failed to set inner map");
+		close(inner_fd);
+
+		if (!(i % 10))
+			printf(".");
+		fflush(stdout);
+	}
+	printf("\n");
+
+	/*
+	 * Fully initialized, attach and run.
+	 */
+	link = bpf_map__attach_struct_ops(skel->maps.pair_ops);
+	SCX_BUG_ON(!link, "Failed to attach struct_ops");
+
+	while (!exit_req && !uei_exited(&skel->bss->uei)) {
+		printf("[SEQ %llu]\n", seq++);
+		printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n",
+		       skel->bss->nr_total,
+		       skel->bss->nr_dispatched,
+		       skel->bss->nr_missing);
+		printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n",
+		       skel->bss->nr_kicks,
+		       skel->bss->nr_preemptions);
+		printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n",
+		       skel->bss->nr_exps,
+		       skel->bss->nr_exp_waits,
+		       skel->bss->nr_exp_empty);
+		printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n",
+		       skel->bss->nr_cgrp_next,
+		       skel->bss->nr_cgrp_coll,
+		       skel->bss->nr_cgrp_empty);
+		fflush(stdout);
+		sleep(1);
+	}
+
+	bpf_link__destroy(link);
+	uei_print(&skel->bss->uei);
+	scx_pair__destroy(skel);
+	return 0;
+}
diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h
new file mode 100644
index 0000000000000..d9666a447d3fd
--- /dev/null
+++ b/tools/sched_ext/scx_pair.h
@@ -0,0 +1,9 @@
+#ifndef __SCX_EXAMPLE_PAIR_H
+#define __SCX_EXAMPLE_PAIR_H
+
+enum {
+	MAX_QUEUED		= 4096,
+	MAX_CGRPS		= 4096,
+};
+
+#endif /* __SCX_EXAMPLE_PAIR_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
new file mode 100644
index 0000000000000..2fb75543a1640
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -0,0 +1,400 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A simple five-level FIFO queue scheduler.
+ *
+ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
+ * assigned to one depending on its compound weight. Each CPU round robins
+ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
+ * queue0, 2 from queue1, 4 from queue2 and so on.
+ *
+ * This scheduler demonstrates:
+ *
+ * - BPF-side queueing using PIDs.
+ * - Sleepable per-task storage allocation using ops.init_task().
+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking
+ *   the CPU away.
+ * - Core-sched support.
+ *
+ * This scheduler is primarily for demonstration and testing of sched_ext
+ * features and unlikely to be useful for actual workloads.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#include
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * Tunables written by the userspace loader (scx_qmap.c) before the program is
+ * loaded. The option letters refer to that loader's command line.
+ */
+const volatile u64 slice_ns = SCX_SLICE_DFL;	/* -s, slice passed to scx_bpf_dispatch() */
+const volatile bool switch_partial;		/* -p, skip scx_bpf_switch_all() in qmap_init() */
+const volatile u32 stall_user_nth;		/* -t, drop every Nth enqueue of a user thread */
+const volatile u32 stall_kernel_nth;		/* -T, drop every Nth enqueue of a kernel thread */
+const volatile u32 dsp_inf_loop_after;		/* -l, keep dispatching PID 2 after this many dispatches */
+const volatile s32 disallow_tgid;		/* -d, tgid whose tasks get p->scx.disallow set */
+
+/* -e, when non-zero, counts down on enqueue and calls scx_bpf_error() at zero */
+u32 test_error_cnt;
+
+struct user_exit_info uei;
+
+/* The five PID FIFOs, each holding up to 4096 entries. */
+struct qmap {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, 4096);
+	__type(value, u32);
+} queue0 SEC(".maps"),
+  queue1 SEC(".maps"),
+  queue2 SEC(".maps"),
+  queue3 SEC(".maps"),
+  queue4 SEC(".maps");
+
+/* Outer map used to pick one of the FIFOs above by index at runtime. */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(max_entries, 5);
+	__type(key, int);
+	__array(values, struct qmap);
+} queue_arr SEC(".maps") = {
+	.values = {
+		[0] = &queue0,
+		[1] = &queue1,
+		[2] = &queue2,
+		[3] = &queue3,
+		[4] = &queue4,
+	},
+};
+
+/*
+ * Per-queue sequence numbers to implement core-sched ordering.
+ *
+ * Tail seq is assigned to each queued task and incremented. Head seq tracks the
+ * sequence number of the latest dispatched task. The distance between a
+ * task's seq and the associated queue's head seq is called the queue distance
+ * and used when comparing two tasks for ordering. See qmap_core_sched_before().
+ */
+static u64 core_sched_head_seqs[5];
+static u64 core_sched_tail_seqs[5];
+
+/* Per-task scheduling context */
+struct task_ctx {
+	bool	force_local;	/* Dispatch directly to local_dsq */
+	u64	core_sched_seq;	/* tail seq of the task's FIFO at enqueue time */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+/* Per-cpu dispatch index and remaining count */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 2);
+	__type(key, u32);
+	__type(value, u64);
+} dispatch_idx_cnt SEC(".maps");
+
+/* Statistics */
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued;
+u64 nr_core_sched_execed;
+
+/*
+ * ops.select_cpu(): pick a CPU for @p.
+ *
+ * If @p may only run on one CPU or @prev_cpu could be claimed as idle,
+ * @prev_cpu is returned and force_local is set so that qmap_enqueue() puts the
+ * task on the local DSQ. Otherwise an idle CPU from @p's allowed mask is
+ * returned if there is one, else @prev_cpu. Returns -ESRCH after reporting an
+ * error when @p has no task_ctx. @wake_flags is not used.
+ */
+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	struct task_ctx *tctx;
+	s32 cpu;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	if (!tctx) {
+		scx_bpf_error("task_ctx lookup failed");
+		return -ESRCH;
+	}
+
+	if (p->nr_cpus_allowed == 1 ||
+	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+		tctx->force_local = true;
+		return prev_cpu;
+	}
+
+	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+	if (cpu >= 0)
+		return cpu;
+
+	return prev_cpu;
+}
+
+static int weight_to_idx(u32 weight)
+{
+	/* Coarsely map the compound weight to a FIFO.
*/
+	if (weight <= 25)
+		return 0;
+	else if (weight <= 50)
+		return 1;
+	else if (weight < 200)
+		return 2;
+	else if (weight < 400)
+		return 3;
+	else
+		return 4;
+}
+
+/*
+ * ops.enqueue(): queue @p on the FIFO selected by its weight.
+ *
+ * Depending on the stall_*_nth and test_error_cnt knobs the task may be
+ * silently dropped or an error triggered first. Tasks flagged force_local or
+ * enqueued with SCX_ENQ_LAST go to the local DSQ, SCX_ENQ_REENQ tasks go to
+ * the global DSQ, and everything else has its PID pushed onto the FIFO (with
+ * the global DSQ as the fallback when the FIFO is full).
+ */
+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	static u32 user_cnt, kernel_cnt;
+	struct task_ctx *tctx;
+	u32 pid = p->pid;
+	int idx = weight_to_idx(p->scx.weight);
+	void *ring;
+
+	/* test knobs: returning without queueing @p is what stalls it */
+	if (p->flags & PF_KTHREAD) {
+		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
+			return;
+	} else {
+		if (stall_user_nth && !(++user_cnt % stall_user_nth))
+			return;
+	}
+
+	if (test_error_cnt && !--test_error_cnt)
+		scx_bpf_error("test triggering error");
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	if (!tctx) {
+		scx_bpf_error("task_ctx lookup failed");
+		return;
+	}
+
+	/*
+	 * All enqueued tasks must have their core_sched_seq updated for correct
+	 * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in
+	 * qmap_ops.flags.
+	 */
+	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+
+	/*
+	 * If qmap_select_cpu() is telling us to or this is the last runnable
+	 * task on the CPU, enqueue locally.
+	 */
+	if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) {
+		tctx->force_local = false;
+		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
+		return;
+	}
+
+	/*
+	 * If the task was re-enqueued due to the CPU being preempted by a
+	 * higher priority scheduling class, just re-enqueue the task directly
+	 * on the global DSQ. As we want another CPU to pick it up, find and
+	 * kick an idle CPU.
+	 */
+	if (enq_flags & SCX_ENQ_REENQ) {
+		s32 cpu;
+
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags);
+		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+		if (cpu >= 0)
+			scx_bpf_kick_cpu(cpu, 0);
+		return;
+	}
+
+	ring = bpf_map_lookup_elem(&queue_arr, &idx);
+	if (!ring) {
+		scx_bpf_error("failed to find ring %d", idx);
+		return;
+	}
+
+	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
+	if (bpf_map_push_elem(ring, &pid, 0)) {
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags);
+		return;
+	}
+
+	__sync_fetch_and_add(&nr_enqueued, 1);
+}
+
+/*
+ * The BPF queue map doesn't support removal and sched_ext can handle spurious
+ * dispatches. qmap_dequeue() is only used to collect statistics.
+ */
+void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	__sync_fetch_and_add(&nr_dequeued, 1);
+	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
+		__sync_fetch_and_add(&nr_core_sched_execed, 1);
+}
+
+/*
+ * Record @p's core_sched_seq as the head seq of the FIFO matching its weight.
+ * Reports an error if @p has no task_ctx.
+ */
+static void update_core_sched_head_seq(struct task_struct *p)
+{
+	struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	int idx = weight_to_idx(p->scx.weight);
+
+	if (tctx)
+		core_sched_head_seqs[idx] = tctx->core_sched_seq;
+	else
+		scx_bpf_error("task_ctx lookup failed");
+}
+
+/*
+ * ops.dispatch(): pop PIDs from the FIFOs and dispatch the first one that
+ * still resolves to a task to the global DSQ.
+ *
+ * The per-CPU dispatch_idx_cnt map keeps the current FIFO index (slot 0) and
+ * how many more tasks may be taken from it (slot 1). At most five FIFO visits
+ * are made per call. @cpu and @prev are not used.
+ */
+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
+{
+	u32 zero = 0, one = 1;
+	u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero);
+	u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one);
+	void *fifo;
+	s32 pid;
+	int i;
+
+	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+		struct task_struct *p;
+
+		/*
+		 * PID 2 should be kthreadd which should mostly be idle and off
+		 * the scheduler. Let's keep dispatching it to force the kernel
+		 * to call this function over and over again.
+		 */
+		p = bpf_task_from_pid(2);
+		if (p) {
+			scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0);
+			bpf_task_release(p);
+			return;
+		}
+	}
+
+	if (!idx || !cnt) {
+		scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt);
+		return;
+	}
+
+	for (i = 0; i < 5; i++) {
+		/* Advance the dispatch cursor and pick the fifo. */
+		if (!*cnt) {
+			*idx = (*idx + 1) % 5;
+			*cnt = 1 << *idx;
+		}
+		(*cnt)--;
+
+		fifo = bpf_map_lookup_elem(&queue_arr, idx);
+		if (!fifo) {
+			scx_bpf_error("failed to find ring %llu", *idx);
+			return;
+		}
+
+		/* Dispatch or advance.
*/
+		if (!bpf_map_pop_elem(fifo, &pid)) {
+			struct task_struct *p;
+
+			p = bpf_task_from_pid(pid);
+			if (p) {
+				update_core_sched_head_seq(p);
+				__sync_fetch_and_add(&nr_dispatched, 1);
+				scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0);
+				bpf_task_release(p);
+				return;
+			}
+		}
+
+		*cnt = 0;
+	}
+}
+
+/*
+ * Weight-scaled distance of @p from the head of its FIFO. Smaller values mean
+ * the task was queued earlier and therefore has higher priority. Returns 0
+ * after reporting an error when @p has no task_ctx.
+ */
+static s64 task_qdist(struct task_struct *p)
+{
+	struct task_ctx *taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	int fifo = weight_to_idx(p->scx.weight);
+	s64 dist;
+
+	if (!taskc) {
+		scx_bpf_error("task_ctx lookup failed");
+		return 0;
+	}
+
+	dist = taskc->core_sched_seq - core_sched_head_seqs[fifo];
+
+	/*
+	 * Each step up in FIFO index doubles how often the FIFO is served, so
+	 * the distance is scaled by a power of two. The exponent is mirrored
+	 * for negative distances; otherwise the scaling would reverse the
+	 * priority order on that side of zero.
+	 */
+	return dist >= 0 ? dist << (4 - fifo) : dist << fifo;
+}
+
+/*
+ * Core-sched ordering callback used when tasks are picked for SMT siblings.
+ * The decision is based purely on the weight-scaled queue distances of @a and
+ * @b so that it roughly tracks what the dispatch path would have done.
+ */
+bool BPF_STRUCT_OPS(qmap_core_sched_before,
+		    struct task_struct *a, struct task_struct *b)
+{
+	s64 dist_a = task_qdist(a);
+	s64 dist_b = task_qdist(b);
+
+	return dist_a > dist_b;
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+	u32 cnt;
+
+	/*
+	 * Called when @cpu is taken by a higher priority scheduling class. This
+	 * makes @cpu no longer available for executing sched_ext tasks.
As we
+	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
+	 * becomes available again, re-enqueue them into the global dsq. See
+	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
+	 */
+	cnt = scx_bpf_reenqueue_local();
+	if (cnt)
+		__sync_fetch_and_add(&nr_reenqueued, cnt);
+}
+
+/*
+ * ops.init_task(): mark @p as disallowed if it belongs to disallow_tgid and
+ * create its task_ctx. Returns 0 on success or -ENOMEM if the storage could
+ * not be created. @args is not used.
+ */
+s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	if (p->tgid == disallow_tgid)
+		p->scx.disallow = true;
+
+	/*
+	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
+	 * in this function and the following will automatically use GFP_KERNEL.
+	 */
+	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE))
+		return 0;
+	else
+		return -ENOMEM;
+}
+
+/*
+ * ops.init(): call scx_bpf_switch_all() unless switch_partial is set.
+ * Always returns 0.
+ */
+s32 BPF_STRUCT_OPS(qmap_init)
+{
+	if (!switch_partial)
+		scx_bpf_switch_all();
+	return 0;
+}
+
+/* ops.exit(): copy the exit info into uei, which the userspace loader reads. */
+void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
+{
+	uei_record(&uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops qmap_ops = {
+	.select_cpu		= (void *)qmap_select_cpu,
+	.enqueue		= (void *)qmap_enqueue,
+	.dequeue		= (void *)qmap_dequeue,
+	.dispatch		= (void *)qmap_dispatch,
+	.core_sched_before	= (void *)qmap_core_sched_before,
+	.cpu_release		= (void *)qmap_cpu_release,
+	.init_task		= (void *)qmap_init_task,
+	.init			= (void *)qmap_init,
+	.exit			= (void *)qmap_exit,
+	.flags			= SCX_OPS_ENQ_LAST,
+	.timeout_ms		= 5000U,
+	.name			= "qmap",
+};
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
new file mode 100644
index 0000000000000..7008b91386449
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.c
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo
+ * Copyright (c) 2022 David Vernet
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "scx_qmap.bpf.skel.h"
+
+const char help_fmt[] =
+"A simple five-level FIFO queue sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID] [-p]\n"
+"\n"
+" -s SLICE_US Override slice duration\n"
+" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
+" -t COUNT Stall every COUNT'th user thread\n"
+" -T COUNT Stall every COUNT'th kernel thread\n"
+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -p Switch only tasks on SCHED_EXT policy instead of all\n"
+" -h Display this help and exit\n";
+
+/* Set from the signal handler; polled by the stats loop in main(). */
+static volatile int exit_req;
+
+/* SIGINT / SIGTERM handler: only requests that main() stop. */
+static void sigint_handler(int dummy)
+{
+	exit_req = 1;
+}
+
+/*
+ * Load and run the qmap scheduler.
+ *
+ * Opens the skeleton, copies the command line options into the BPF program's
+ * rodata / bss, loads and attaches the struct_ops and then prints the counters
+ * once a second until a signal is received or the BPF side reports an exit.
+ *
+ * Returns 0 on a normal run and after -h, and 1 on an unknown option.
+ * Setup failures are handled by SCX_BUG_ON().
+ */
+int main(int argc, char **argv)
+{
+	struct scx_qmap *skel;
+	struct bpf_link *link;
+	int opt;
+
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+	skel = scx_qmap__open();
+	SCX_BUG_ON(!skel, "Failed to open skel");
+
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) {
+		switch (opt) {
+		case 's':
+			/* the option is in microseconds, the BPF side wants ns */
+			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
+			break;
+		case 'e':
+			skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+			break;
+		case 't':
+			skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
+			break;
+		case 'T':
+			skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
+			break;
+		case 'l':
+			skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
+			break;
+		case 'd':
+			/* a negative PID selects this process itself */
+			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
+			if (skel->rodata->disallow_tgid < 0)
+				skel->rodata->disallow_tgid = getpid();
+			break;
+		case 'p':
+			skel->rodata->switch_partial = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel");
+
+	link = bpf_map__attach_struct_ops(skel->maps.qmap_ops);
+	SCX_BUG_ON(!link, "Failed to attach struct_ops");
+
+	while (!exit_req && !uei_exited(&skel->bss->uei)) {
+		/*
+		 * Keep the counters at their full 64-bit width and print them
+		 * with matching format macros; only the difference is signed.
+		 */
+		uint64_t nr_enqueued = skel->bss->nr_enqueued;
+		uint64_t nr_dispatched = skel->bss->nr_dispatched;
+
+		printf("enq=%" PRIu64 ", dsp=%" PRIu64 ", delta=%" PRId64 ", reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" PRIu64 "\n",
+		       nr_enqueued, nr_dispatched,
+		       (int64_t)(nr_enqueued - nr_dispatched),
+		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
+		       skel->bss->nr_core_sched_execed);
+		fflush(stdout);
+		sleep(1);
+	}
+
+	bpf_link__destroy(link);
+	uei_print(&skel->bss->uei);
+	scx_qmap__destroy(skel);
+	return 0;
+}
diff --git a/tools/sched_ext/scx_rusty/.gitignore b/tools/sched_ext/scx_rusty/.gitignore
new file mode 100644
index 0000000000000..186dba259ec21
--- /dev/null
+++ b/tools/sched_ext/scx_rusty/.gitignore
@@ -0,0 +1,3 @@
+src/bpf/.output
+Cargo.lock
+target
diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml
new file mode 100644
index 0000000000000..a8b4231d1bde9
--- /dev/null
+++ b/tools/sched_ext/scx_rusty/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+name = "scx_rusty"
+version = "0.5.3"
+authors = ["Dan Schatzberg ", "Meta"]
+edition = "2021"
+description = "Userspace scheduling with BPF"
+license = "GPL-2.0-only"
+
+[dependencies]
+anyhow = "1.0.65"
+bitvec = { version = "1.0", features = ["serde"] }
+clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
+ctrlc = { version = "3.1", features = ["termination"] }
+fb_procfs = "0.7.0"
+hex = "0.4.3"
+libbpf-rs = "0.22.0"
+libc = "0.2.137"
+log = "0.4.17"
+ordered-float = "3.4.0"
+scx_utils = "0.5"
+simplelog = "0.12.0"
+
+[build-dependencies]
+scx_utils = "0.5"
+
+[features]
+enable_backtrace = []
diff --git a/tools/sched_ext/scx_rusty/README.md b/tools/sched_ext/scx_rusty/README.md
new file mode
100644 index 0000000000000..990e51aaf43b3 --- /dev/null +++ b/tools/sched_ext/scx_rusty/README.md @@ -0,0 +1,36 @@ +# scx_rusty + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the +scheduler does a simple round robin in each domain, and the user space portion +(written in Rust) calculates the load factor of each domain, and informs BPF of +how tasks should be load balanced accordingly. + +## How To Install + +Available as a [Rust crate](https://crates.io/crates/scx_rusty): `cargo add scx_rusty` + +## Typical Use Case + +Rusty is designed to be flexible, and accommodate different architectures and +workloads. Various load balancing thresholds (e.g. greediness, frequency, etc), +as well as how Rusty should partition the system into scheduling domains, can +be tuned to achieve the optimal configuration for any given system or workload. + +## Production Ready? + +Yes. If tuned correctly, rusty should be performant across various CPU +architectures and workloads. Rusty by default creates a separate scheduling +domain per-LLC, so its default configuration may be performant as well. Note +however that scx_rusty does not yet disambiguate between LLCs in different NUMA +nodes, so it may perform better on multi-CCX machines where all the LLCs share +the same socket, as opposed to multi-socket machines. + +Note as well that you may run into an issue with infeasible weights, where a +task with a very high weight may cause the scheduler to incorrectly leave cores +idle because it thinks they're necessary to accommodate the compute for a +single task. This can also happen in CFS, and should soon be addressed for +scx_rusty.
diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs
new file mode 100644
index 0000000000000..d26db839cd9e1
--- /dev/null
+++ b/tools/sched_ext/scx_rusty/build.rs
@@ -0,0 +1,13 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+
+// Cargo build script. Hands the C interface header and the BPF source to
+// scx_utils::BpfBuilder; either unwrap() panics, and thereby fails the build,
+// if the builder reports an error.
+// NOTE(review): presumably "bpf_intf.rs" is the generated bindings file and
+// "bpf" the skeleton name - confirm against the scx_utils documentation.
+fn main() {
+    scx_utils::BpfBuilder::new()
+        .unwrap()
+        .enable_intf("src/bpf/intf.h", "bpf_intf.rs")
+        .enable_skel("src/bpf/main.bpf.c", "bpf")
+        .build()
+        .unwrap();
+}
diff --git a/tools/sched_ext/scx_rusty/rustfmt.toml b/tools/sched_ext/scx_rusty/rustfmt.toml
new file mode 100644
index 0000000000000..b7258ed0a8d84
--- /dev/null
+++ b/tools/sched_ext/scx_rusty/rustfmt.toml
@@ -0,0 +1,8 @@
+# Get help on options with `rustfmt --help=config`
+# Please keep these in alphabetical order.
+edition = "2021"
+group_imports = "StdExternalCrate"
+imports_granularity = "Item"
+merge_derives = false
+use_field_init_shorthand = true
+version = "Two"
diff --git a/tools/sched_ext/scx_rusty/src/bpf/intf.h b/tools/sched_ext/scx_rusty/src/bpf/intf.h
new file mode 100644
index 0000000000000..f295695102051
--- /dev/null
+++ b/tools/sched_ext/scx_rusty/src/bpf/intf.h
@@ -0,0 +1,97 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2.
+#ifndef __INTF_H
+#define __INTF_H
+
+#include
+/*
+ * Outside the kernel build __kptr is defined away to nothing.
+ * NOTE(review): the #error text names __kptr_ref while the guard tests
+ * __kptr - confirm which spelling is intended.
+ */
+#ifndef __kptr
+#ifdef __KERNEL__
+#error "__kptr_ref not defined in the kernel"
+#endif
+#define __kptr
+#endif
+
+/* fixed-width aliases for builds where __KERNEL__ is not defined */
+#ifndef __KERNEL__
+typedef unsigned char u8;
+typedef unsigned int u32;
+typedef unsigned long long u64;
+#endif
+
+#include
+
+enum consts {
+	MAX_CPUS		= 512,
+	MAX_DOMS		= 64,	/* limited to avoid complex bitmask ops */
+	CACHELINE_SIZE		= 64,
+
+	/*
+	 * When userspace load balancer is trying to determine the tasks to push
+	 * out from an overloaded domain, it looks at the following number
+	 * of recently active tasks of the domain. While this may lead to
+	 * spurious migration victim selection failures in pathological cases,
+	 * this isn't a practical problem as the LB rounds are best-effort
+	 * anyway and will be retried until loads are balanced.
+	 */
+	MAX_DOM_ACTIVE_PIDS	= 1024,
+};
+
+/* Statistics */
+enum stat_idx {
+	/* The following fields add up to all dispatched tasks */
+	RUSTY_STAT_WAKE_SYNC,
+	RUSTY_STAT_PREV_IDLE,
+	RUSTY_STAT_GREEDY_IDLE,
+	RUSTY_STAT_PINNED,
+	RUSTY_STAT_DIRECT_DISPATCH,
+	RUSTY_STAT_DIRECT_GREEDY,
+	RUSTY_STAT_DIRECT_GREEDY_FAR,
+	RUSTY_STAT_DSQ_DISPATCH,
+	RUSTY_STAT_GREEDY,
+
+	/* Extra stats that don't contribute to total */
+	RUSTY_STAT_REPATRIATE,
+	RUSTY_STAT_KICK_GREEDY,
+	RUSTY_STAT_LOAD_BALANCE,
+
+	/* Errors */
+	RUSTY_STAT_TASK_GET_ERR,
+
+	/* number of entries above, keep last */
+	RUSTY_NR_STATS,
+};
+
+/* Per-task state */
+struct task_ctx {
+	/* The domains this task can run on */
+	u64 dom_mask;
+
+	struct bpf_cpumask __kptr *cpumask;
+	u32 dom_id;
+	u32 weight;
+	bool runnable;
+	u64 dom_active_pids_gen;
+	u64 running_at;
+
+	/* The task is a workqueue worker thread */
+	bool is_kworker;
+
+	/* Allowed on all CPUs and eligible for DIRECT_GREEDY optimization */
+	bool all_cpus;
+
+	/* select_cpu() telling enqueue() to queue directly on the DSQ */
+	bool dispatch_local;
+
+	struct ravg_data dcyc_rd;
+};
+
+/* Per-domain state, stored in the dom_data map of main.bpf.c */
+struct dom_ctx {
+	u64 vtime_now;
+	struct bpf_cpumask __kptr *cpumask;
+	struct bpf_cpumask __kptr *direct_greedy_cpumask;
+
+	/* adjusted by dom_load_adj() under the domain's dom_load_locks entry */
+	u64 load;
+	struct ravg_data load_rd;
+	/* last time dom_load_adj() printed its debug line for this domain */
+	u64 dbg_load_printed_at;
+};
+
+#endif /* __INTF_H */
diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
new file mode 100644
index 0000000000000..fe4de979f2a2d
--- /dev/null
+++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c
@@ -0,0 +1,1164 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+/*
+ * This software may be used and distributed according to the terms of the
+ * GNU General Public License version 2.
+ *
+ * scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF
+ * part does simple round robin in each domain and the userspace part
+ * calculates the load factor of each domain and tells the BPF part how to load
+ * balance the domains.
+ *
+ * Every task has an entry in the task_data map which lists which domain the
+ * task belongs to. When a task first enters the system (rusty_prep_enable),
+ * they are round-robined to a domain.
+ *
+ * rusty_select_cpu is the primary scheduling logic, invoked when a task
+ * becomes runnable. The lb_data map is populated by userspace to inform the BPF
+ * scheduler that a task should be migrated to a new domain. Otherwise, the task
+ * is scheduled in priority order as follows:
+ * * The current core if the task was woken up synchronously and there are idle
+ *   cpus in the system
+ * * The previous core, if idle
+ * * The pinned-to core if the task is pinned to a specific core
+ * * Any idle cpu in the domain
+ *
+ * If none of the above conditions are met, then the task is enqueued to a
+ * dispatch queue corresponding to the domain (rusty_enqueue).
+ *
+ * rusty_dispatch will attempt to consume a task from its domain's
+ * corresponding dispatch queue (this occurs after scheduling any tasks directly
+ * assigned to it due to the logic in rusty_select_cpu). If no task is found,
If no task is found, + * then greedy load stealing will attempt to find a task on another dispatch + * queue to run. + * + * Load balancing is almost entirely handled by userspace. BPF populates the + * task weight, dom mask and current dom in the task_data map and executes the + * load balance based on userspace populating the lb_data map. + */ +#include +#include +#include "intf.h" + +#include +#include +#include +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +/* + * const volatiles are set during initialization and treated as consts by the + * jit compiler. + */ + +/* + * Domains and cpus + */ +const volatile u32 nr_doms = 32; /* !0 for veristat, set during init */ +const volatile u32 nr_cpus = 64; /* !0 for veristat, set during init */ +const volatile u32 cpu_dom_id_map[MAX_CPUS]; +const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; +const volatile u32 load_half_life = 1000000000 /* 1s */; + +const volatile bool kthreads_local; +const volatile bool fifo_sched; +const volatile bool switch_partial; +const volatile u32 greedy_threshold; +const volatile u32 debug; + +/* base slice duration */ +const volatile u64 slice_ns = SCX_SLICE_DFL; + +/* + * Exit info + */ +int exit_kind = SCX_EXIT_NONE; +char exit_msg[SCX_EXIT_MSG_LEN]; + +/* + * Per-CPU context + */ +struct pcpu_ctx { + u32 dom_rr_cur; /* used when scanning other doms */ + + /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */ + u8 _padding[CACHELINE_SIZE - sizeof(u32)]; +} __attribute__((aligned(CACHELINE_SIZE))); + +struct pcpu_ctx pcpu_ctx[MAX_CPUS]; + +/* + * Domain context + */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct dom_ctx); + __uint(max_entries, MAX_DOMS); + __uint(map_flags, 0); +} dom_data SEC(".maps"); + +struct lock_wrapper { + struct bpf_spin_lock lock; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct lock_wrapper); + __uint(max_entries, 
MAX_DOMS); + __uint(map_flags, 0); +} dom_load_locks SEC(".maps"); + +struct dom_active_pids { + u64 gen; + u64 read_idx; + u64 write_idx; + s32 pids[MAX_DOM_ACTIVE_PIDS]; +}; + +struct dom_active_pids dom_active_pids[MAX_DOMS]; + +const u64 ravg_1 = 1 << RAVG_FRAC_BITS; + +static void dom_load_adj(u32 dom_id, s64 adj, u64 now) +{ + struct dom_ctx *domc; + struct lock_wrapper *lockw; + + domc = bpf_map_lookup_elem(&dom_data, &dom_id); + lockw = bpf_map_lookup_elem(&dom_load_locks, &dom_id); + + if (!domc || !lockw) { + scx_bpf_error("dom_ctx / lock lookup failed"); + return; + } + + bpf_spin_lock(&lockw->lock); + domc->load += adj; + ravg_accumulate(&domc->load_rd, domc->load, now, load_half_life); + bpf_spin_unlock(&lockw->lock); + + if (adj < 0 && (s64)domc->load < 0) + scx_bpf_error("cpu%d dom%u load underflow (load=%lld adj=%lld)", + bpf_get_smp_processor_id(), dom_id, domc->load, adj); + + if (debug >=2 && + (!domc->dbg_load_printed_at || now - domc->dbg_load_printed_at >= 1000000000)) { + bpf_printk("LOAD ADJ dom=%u adj=%lld load=%llu", + dom_id, + adj, + ravg_read(&domc->load_rd, now, load_half_life) >> RAVG_FRAC_BITS); + domc->dbg_load_printed_at = now; + } +} + +static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, + u32 from_dom_id, u32 to_dom_id, u64 now) +{ + struct dom_ctx *from_domc, *to_domc; + struct lock_wrapper *from_lockw, *to_lockw; + struct ravg_data task_load_rd; + u64 from_load[2], to_load[2], task_load; + + from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id); + from_lockw = bpf_map_lookup_elem(&dom_load_locks, &from_dom_id); + to_domc = bpf_map_lookup_elem(&dom_data, &to_dom_id); + to_lockw = bpf_map_lookup_elem(&dom_load_locks, &to_dom_id); + if (!from_domc || !from_lockw || !to_domc || !to_lockw) { + scx_bpf_error("dom_ctx / lock lookup failed"); + return; + } + + /* + * @p is moving from @from_dom_id to @to_dom_id. Its load contribution + * should be moved together. We only track duty cycle for tasks. 
Scale + * it by weight to get load_rd. + */ + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); + task_load_rd = taskc->dcyc_rd; + ravg_scale(&task_load_rd, p->scx.weight, 0); + + if (debug >= 2) + task_load = ravg_read(&task_load_rd, now, load_half_life); + + /* transfer out of @from_dom_id */ + bpf_spin_lock(&from_lockw->lock); + if (taskc->runnable) + from_domc->load -= p->scx.weight; + + if (debug >= 2) + from_load[0] = ravg_read(&from_domc->load_rd, now, load_half_life); + + ravg_transfer(&from_domc->load_rd, from_domc->load, + &task_load_rd, taskc->runnable, load_half_life, false); + + if (debug >= 2) + from_load[1] = ravg_read(&from_domc->load_rd, now, load_half_life); + + bpf_spin_unlock(&from_lockw->lock); + + /* transfer into @to_dom_id */ + bpf_spin_lock(&to_lockw->lock); + if (taskc->runnable) + to_domc->load += p->scx.weight; + + if (debug >= 2) + to_load[0] = ravg_read(&to_domc->load_rd, now, load_half_life); + + ravg_transfer(&to_domc->load_rd, to_domc->load, + &task_load_rd, taskc->runnable, load_half_life, true); + + if (debug >= 2) + to_load[1] = ravg_read(&to_domc->load_rd, now, load_half_life); + + bpf_spin_unlock(&to_lockw->lock); + + if (debug >= 2) + bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu", + from_dom_id, to_dom_id, + task_load >> RAVG_FRAC_BITS, + from_load[0] >> RAVG_FRAC_BITS, + from_load[1] >> RAVG_FRAC_BITS, + to_load[0] >> RAVG_FRAC_BITS, + to_load[1] >> RAVG_FRAC_BITS); +} + +/* + * Statistics + */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, RUSTY_NR_STATS); +} stats SEC(".maps"); + +static inline void stat_add(enum stat_idx idx, u64 addend) +{ + u32 idx_v = idx; + + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); + if (cnt_p) + (*cnt_p) += addend; +} + +/* Map pid -> task_ctx */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, struct task_ctx); + 
__uint(max_entries, 1000000); + __uint(map_flags, 0); +} task_data SEC(".maps"); +/* Return @p's task_data entry (keyed by pid); on a miss, report it through scx_bpf_error() and return NULL. */ +struct task_ctx *lookup_task_ctx(struct task_struct *p) +{ + struct task_ctx *taskc; + s32 pid = p->pid; + + if ((taskc = bpf_map_lookup_elem(&task_data, &pid))) { + return taskc; + } else { + scx_bpf_error("task_ctx lookup failed for pid %d", p->pid); + return NULL; + } +} + +/* + * This is populated from userspace to indicate which pids should be reassigned + * to new doms. The value is the ID of the domain the pid should move to. + */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, u32); + __uint(max_entries, 1000); + __uint(map_flags, 0); +} lb_data SEC(".maps"); + +/* + * Userspace tuner will frequently update the following struct with tuning + * parameters and bump its gen. refresh_tune_params() converts them into forms + * that can be used directly in the scheduling paths. + */ +struct tune_input{ + u64 gen; + u64 direct_greedy_cpumask[MAX_CPUS / 64]; /* one bit per CPU, 64 CPUs per word */ + u64 kick_greedy_cpumask[MAX_CPUS / 64]; /* one bit per CPU, 64 CPUs per word */ +} tune_input; + +u64 tune_params_gen; /* tune_input.gen most recently applied by refresh_tune_params() */ +private(A) struct bpf_cpumask __kptr *all_cpumask; /* CPUs that belong to any domain, filled in by create_dom() */ +private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask; /* synced from tune_input.direct_greedy_cpumask */ +private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask; /* synced from tune_input.kick_greedy_cpumask */ +/* Return true if @a is before @b; comparing the signed difference stays correct across u64 wraparound. */ +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} +/* Look up @cpu's domain in cpu_dom_id_map. Always 0 when there is at most one domain; MAX_DOMS if MEMBER_VPTR() yields no pointer for @cpu. */ +static u32 cpu_to_dom_id(s32 cpu) +{ + const volatile u32 *dom_idp; + + if (nr_doms <= 1) + return 0; + + dom_idp = MEMBER_VPTR(cpu_dom_id_map, [cpu]); + if (!dom_idp) + return MAX_DOMS; + + return *dom_idp; +} +/* If tune_input.gen changed, re-sync the global and per-domain greedy cpumasks from the tune_input bitmaps. */ +static void refresh_tune_params(void) +{ + s32 cpu; + + if (tune_params_gen == tune_input.gen) + return; + + tune_params_gen = tune_input.gen; + + bpf_for(cpu, 0, nr_cpus) { + u32 dom_id = cpu_to_dom_id(cpu); + struct dom_ctx *domc; + + if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { + scx_bpf_error("Failed to lookup dom[%u]", dom_id); + return; + } + + if (tune_input.direct_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { + if (direct_greedy_cpumask) + bpf_cpumask_set_cpu(cpu, 
direct_greedy_cpumask); + if (domc->direct_greedy_cpumask) + bpf_cpumask_set_cpu(cpu, domc->direct_greedy_cpumask); + } else { + if (direct_greedy_cpumask) + bpf_cpumask_clear_cpu(cpu, direct_greedy_cpumask); + if (domc->direct_greedy_cpumask) + bpf_cpumask_clear_cpu(cpu, domc->direct_greedy_cpumask); + } + + if (tune_input.kick_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { + if (kick_greedy_cpumask) + bpf_cpumask_set_cpu(cpu, kick_greedy_cpumask); + } else { + if (kick_greedy_cpumask) + bpf_cpumask_clear_cpu(cpu, kick_greedy_cpumask); + } + } +} + +static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, + u32 new_dom_id, bool init_dsq_vtime) +{ + struct dom_ctx *old_domc, *new_domc; + struct bpf_cpumask *d_cpumask, *t_cpumask; + u32 old_dom_id = taskc->dom_id; + s64 vtime_delta; + + old_domc = bpf_map_lookup_elem(&dom_data, &old_dom_id); + if (!old_domc) { + scx_bpf_error("Failed to lookup old dom%u", old_dom_id); + return false; + } + + if (init_dsq_vtime) + vtime_delta = 0; + else + vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now; + + new_domc = bpf_map_lookup_elem(&dom_data, &new_dom_id); + if (!new_domc) { + scx_bpf_error("Failed to lookup new dom%u", new_dom_id); + return false; + } + + d_cpumask = new_domc->cpumask; + if (!d_cpumask) { + scx_bpf_error("Failed to get dom%u cpumask kptr", + new_dom_id); + return false; + } + + t_cpumask = taskc->cpumask; + if (!t_cpumask) { + scx_bpf_error("Failed to look up task cpumask"); + return false; + } + + /* + * set_cpumask might have happened between userspace requesting LB and + * here and @p might not be able to run in @dom_id anymore. Verify. 
+ */ + if (bpf_cpumask_intersects((const struct cpumask *)d_cpumask, + p->cpus_ptr)) { + u64 now = bpf_ktime_get_ns(); + + dom_load_xfer_task(p, taskc, taskc->dom_id, new_dom_id, now); + + p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta; + taskc->dom_id = new_dom_id; + bpf_cpumask_and(t_cpumask, (const struct cpumask *)d_cpumask, + p->cpus_ptr); + } + + return taskc->dom_id == new_dom_id; +} + +s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) +{ + const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); + struct task_ctx *taskc; + struct bpf_cpumask *p_cpumask; + bool prev_domestic, has_idle_cores; + s32 cpu; + + refresh_tune_params(); + + if (!(taskc = lookup_task_ctx(p)) || !(p_cpumask = taskc->cpumask)) + goto enoent; + + if (p->nr_cpus_allowed == 1) { + cpu = prev_cpu; + if (kthreads_local && (p->flags & PF_KTHREAD)) { + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + } else { + stat_add(RUSTY_STAT_PINNED, 1); + } + goto direct; + } + + /* + * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the + * local dsq of the waker. 
+ */ + if (wake_flags & SCX_WAKE_SYNC) { + struct task_struct *current = (void *)bpf_get_current_task(); + + if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && + taskc->dom_id < MAX_DOMS) { + struct dom_ctx *domc; + struct bpf_cpumask *d_cpumask; + const struct cpumask *idle_cpumask; + bool has_idle; + + domc = bpf_map_lookup_elem(&dom_data, &taskc->dom_id); + if (!domc) { + scx_bpf_error("Failed to find dom%u", taskc->dom_id); + goto enoent; + } + d_cpumask = domc->cpumask; + if (!d_cpumask) { + scx_bpf_error("Failed to acquire dom%u cpumask kptr", + taskc->dom_id); + goto enoent; + } + + idle_cpumask = scx_bpf_get_idle_cpumask(); + + has_idle = bpf_cpumask_intersects((const struct cpumask *)d_cpumask, + idle_cpumask); + + scx_bpf_put_idle_cpumask(idle_cpumask); + + if (has_idle) { + cpu = bpf_get_smp_processor_id(); + if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + stat_add(RUSTY_STAT_WAKE_SYNC, 1); + goto direct; + } + } + } + } + + has_idle_cores = !bpf_cpumask_empty(idle_smtmask); + + /* did @p get pulled out to a foreign domain by e.g. greedy execution? */ + prev_domestic = bpf_cpumask_test_cpu(prev_cpu, + (const struct cpumask *)p_cpumask); + + /* + * See if we want to keep @prev_cpu. We want to keep @prev_cpu if the + * whole physical core is idle. If the sibling[s] are busy, it's likely + * more advantageous to look for wholly idle cores first. + */ + if (prev_domestic) { + if (bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + stat_add(RUSTY_STAT_PREV_IDLE, 1); + cpu = prev_cpu; + goto direct; + } + } else { + /* + * @prev_cpu is foreign. Linger iff the domain isn't too busy as + * indicated by direct_greedy_cpumask. 
There may also be an idle + * CPU in the domestic domain + */ + if (direct_greedy_cpumask && + bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *) + direct_greedy_cpumask) && + bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + stat_add(RUSTY_STAT_GREEDY_IDLE, 1); + cpu = prev_cpu; + goto direct; + } + } + + /* + * @prev_cpu didn't work out. Let's see whether there's an idle CPU @p + * can be directly dispatched to. We'll first try to find the best idle + * domestic CPU and then move onto foreign. + */ + + /* If there is a domestic idle core, dispatch directly */ + if (has_idle_cores) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + goto direct; + } + } + + /* + * If @prev_cpu was domestic and is idle itself even though the core + * isn't, picking @prev_cpu may improve L1/2 locality. + */ + if (prev_domestic && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + cpu = prev_cpu; + goto direct; + } + + /* If there is any domestic idle CPU, dispatch directly */ + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); + if (cpu >= 0) { + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + goto direct; + } + + /* + * Domestic domain is fully booked. If there are CPUs which are idle and + * under-utilized, ignore domain boundaries and push the task there. Try + * to find an idle core first. 
+ */ + if (taskc->all_cpus && direct_greedy_cpumask && + !bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) { + u32 dom_id = cpu_to_dom_id(prev_cpu); + struct dom_ctx *domc; + + if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { + scx_bpf_error("Failed to lookup dom[%u]", dom_id); + goto enoent; + } + + /* Try to find an idle core in the previous and then any domain */ + if (has_idle_cores) { + if (domc->direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + domc->direct_greedy_cpumask, + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); + goto direct; + } + } + + if (direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + direct_greedy_cpumask, + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); + goto direct; + } + } + } + + /* + * No idle core. Is there any idle CPU? + */ + if (domc->direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + domc->direct_greedy_cpumask, 0); + if (cpu >= 0) { + stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); + goto direct; + } + } + + if (direct_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + direct_greedy_cpumask, 0); + if (cpu >= 0) { + stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); + goto direct; + } + } + } + + /* + * We're going to queue on the domestic domain's DSQ. @prev_cpu may be + * in a different domain. Returning an out-of-domain CPU can lead to + * stalls as all in-domain CPUs may be idle by the time @p gets + * enqueued. 
+ */ + if (prev_domestic) + cpu = prev_cpu; + else + cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); + + scx_bpf_put_idle_cpumask(idle_smtmask); + return cpu; + +direct: + taskc->dispatch_local = true; + scx_bpf_put_idle_cpumask(idle_smtmask); + return cpu; + +enoent: + scx_bpf_put_idle_cpumask(idle_smtmask); + return -ENOENT; +} + +void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *taskc; + struct bpf_cpumask *p_cpumask; + pid_t pid = p->pid; + u32 *new_dom; + s32 cpu; + + if (!(taskc = lookup_task_ctx(p))) + return; + if (!(p_cpumask = taskc->cpumask)) { + scx_bpf_error("NULL cpmask"); + return; + } + + /* + * Migrate @p to a new domain if requested by userland through lb_data. + */ + new_dom = bpf_map_lookup_elem(&lb_data, &pid); + if (new_dom && *new_dom != taskc->dom_id && + task_set_domain(taskc, p, *new_dom, false)) { + stat_add(RUSTY_STAT_LOAD_BALANCE, 1); + taskc->dispatch_local = false; + cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, 0); + goto dom_queue; + } + + if (taskc->dispatch_local) { + taskc->dispatch_local = false; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + return; + } + + /* + * @p is about to be queued on its domain's dsq. However, @p may be on a + * foreign CPU due to a greedy execution and not have gone through + * ->select_cpu() if it's being enqueued e.g. after slice exhaustion. If + * so, @p would be queued on its domain's dsq but none of the CPUs in + * the domain would be woken up which can induce temporary execution + * stalls. Kick a domestic CPU if @p is on a foreign domain. 
+ */ + if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), (const struct cpumask *)p_cpumask)) { + cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); + /* only kick a valid CPU, same as the load-balance path above */ + if (cpu >= 0) + scx_bpf_kick_cpu(cpu, 0); + stat_add(RUSTY_STAT_REPATRIATE, 1); + } + +dom_queue: /* FIFO mode queues on the domain DSQ as-is, otherwise by vtime */ + if (fifo_sched) { + scx_bpf_dispatch(p, taskc->dom_id, slice_ns, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + u32 dom_id = taskc->dom_id; + struct dom_ctx *domc; + + domc = bpf_map_lookup_elem(&dom_data, &dom_id); + if (!domc) { + scx_bpf_error("Failed to lookup dom[%u]", dom_id); + return; + } + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, domc->vtime_now - slice_ns)) + vtime = domc->vtime_now - slice_ns; + + scx_bpf_dispatch_vtime(p, taskc->dom_id, slice_ns, vtime, enq_flags); + } + + /* + * If there are CPUs which are idle and not saturated, wake them up to + * see whether they'd be able to steal the just queued task. This path + * is taken only if DIRECT_GREEDY didn't trigger in select_cpu(). + * + * While both mechanisms serve very similar purposes, DIRECT_GREEDY + * emplaces the task in a foreign CPU directly while KICK_GREEDY just + * wakes up a foreign CPU which will then try to execute from its + * domestic domain first before snooping foreign ones. + * + * While KICK_GREEDY is a more expensive way of accelerating greedy + * execution, DIRECT_GREEDY shows negative performance impacts when the + * CPUs are highly loaded while KICK_GREEDY doesn't. Even under fairly + * high utilization, KICK_GREEDY can slightly improve work-conservation. 
+ */ + if (taskc->all_cpus && kick_greedy_cpumask) { + cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) + kick_greedy_cpumask, 0); + if (cpu >= 0) { + stat_add(RUSTY_STAT_KICK_GREEDY, 1); + scx_bpf_kick_cpu(cpu, 0); + } + } +} +/* Return true if any CPU below nr_cpus is set in both @cpumask and the dom_cpumasks bitmap of @dom_id; false for an out-of-range @dom_id. */ +static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id) +{ + s32 cpu; + + if (dom_id >= MAX_DOMS) + return false; + + bpf_for(cpu, 0, nr_cpus) { + if (bpf_cpumask_test_cpu(cpu, cpumask) && + (dom_cpumasks[dom_id][cpu / 64] & (1LLU << (cpu % 64)))) + return true; + } + return false; +} +/* Advance @cpu's round-robin cursor to the next domain, stepping past @cpu's own domain, and return it. Returns 0 if MEMBER_VPTR() yields no pcpu_ctx for @cpu. */ +static u32 dom_rr_next(s32 cpu) +{ + struct pcpu_ctx *pcpuc; + u32 dom_id; + + pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]); + if (!pcpuc) + return 0; + + dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms; + + if (dom_id == cpu_to_dom_id(cpu)) + dom_id = (dom_id + 1) % nr_doms; + + pcpuc->dom_rr_cur = dom_id; + return dom_id; +} +/* Consume from @cpu's own domain DSQ first. If that yields nothing and greedy_threshold is non-zero, try up to nr_doms - 1 other domains whose DSQ holds at least greedy_threshold tasks. */ +void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev) +{ + u32 dom = cpu_to_dom_id(cpu); + + if (scx_bpf_consume(dom)) { + stat_add(RUSTY_STAT_DSQ_DISPATCH, 1); + return; + } + + if (!greedy_threshold) + return; + + bpf_repeat(nr_doms - 1) { + u32 dom_id = dom_rr_next(cpu); + + if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold && + scx_bpf_consume(dom_id)) { + stat_add(RUSTY_STAT_GREEDY, 1); + break; + } + } +} +/* Mark @p runnable, record whether it is a workqueue worker, update its duty-cycle average and add its weight to its domain's load. */ +void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) +{ + u64 now = bpf_ktime_get_ns(); + struct task_ctx *taskc; + + if (!(taskc = lookup_task_ctx(p))) + return; + + taskc->runnable = true; + taskc->is_kworker = p->flags & PF_WQ_WORKER; + + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); + dom_load_adj(taskc->dom_id, p->scx.weight, now); +} + +void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) +{ + struct task_ctx *taskc; + struct dom_ctx *domc; + u32 dom_id, dap_gen; + + if (!(taskc = lookup_task_ctx(p))) + return; + + taskc->running_at = bpf_ktime_get_ns(); /* start of this run, consumed by rusty_stopping() */ + dom_id = taskc->dom_id; + if (dom_id >= MAX_DOMS) { + 
scx_bpf_error("Invalid dom ID"); + return; + } + + /* + * Record that @p has been active in @domc. Load balancer will only + * consider recently active tasks. Access synchronization rules aren't + * strict. We just need to be right most of the time. + */ + dap_gen = dom_active_pids[dom_id].gen; + if (taskc->dom_active_pids_gen != dap_gen) { + u64 idx = __sync_fetch_and_add(&dom_active_pids[dom_id].write_idx, 1) % + MAX_DOM_ACTIVE_PIDS; + s32 *pidp; + + pidp = MEMBER_VPTR(dom_active_pids, [dom_id].pids[idx]); + if (!pidp) { + scx_bpf_error("dom_active_pids[%u][%llu] indexing failed", + dom_id, idx); + return; + } + + *pidp = p->pid; + taskc->dom_active_pids_gen = dap_gen; + } + + if (fifo_sched) + return; + + domc = bpf_map_lookup_elem(&dom_data, &dom_id); + if (!domc) { + scx_bpf_error("Failed to lookup dom[%u]", dom_id); + return; + } + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. 
+ */ + if (vtime_before(domc->vtime_now, p->scx.dsq_vtime)) + domc->vtime_now = p->scx.dsq_vtime; +} +/* Charge @p's dsq_vtime for the time it ran since rusty_running(). Nothing to do in FIFO mode. */ +void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) +{ + struct task_ctx *taskc; + + if (fifo_sched) + return; + + if (!(taskc = lookup_task_ctx(p))) + return; + + /* scale the execution time by the inverse of the weight (a weight of 100 charges it unscaled) and charge */ + p->scx.dsq_vtime += + (bpf_ktime_get_ns() - taskc->running_at) * 100 / p->scx.weight; +} +/* Mark @p not runnable, update its duty-cycle average and remove its weight from its domain's load. */ +void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) +{ + u64 now = bpf_ktime_get_ns(); + struct task_ctx *taskc; + + if (!(taskc = lookup_task_ctx(p))) + return; + + taskc->runnable = false; + + ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); + dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); +} +/* Cache the new @weight in @p's task_ctx. */ +void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight) +{ + struct task_ctx *taskc; + + if (!(taskc = lookup_task_ctx(p))) + return; + + taskc->weight = weight; +} +/* Rebuild taskc->dom_mask with every domain that intersects @cpumask and return the first match, scanning round-robin from this CPU's dom_rr_cur. Returns MAX_DOMS if nothing matches or the CPU id is out of range. */ +static u32 task_pick_domain(struct task_ctx *taskc, struct task_struct *p, + const struct cpumask *cpumask) +{ + s32 cpu = bpf_get_smp_processor_id(); + u32 first_dom = MAX_DOMS, dom; + + if (cpu < 0 || cpu >= MAX_CPUS) + return MAX_DOMS; + + taskc->dom_mask = 0; + + dom = pcpu_ctx[cpu].dom_rr_cur++; + bpf_repeat(nr_doms) { + dom = (dom + 1) % nr_doms; + if (cpumask_intersects_domain(cpumask, dom)) { + taskc->dom_mask |= 1LLU << dom; + /* + * The starting point is round-robin'd, so the first + * match should be spread across all the domains. 
+ */ + if (first_dom == MAX_DOMS) + first_dom = dom; + } + } + + return first_dom; +} +/* Pick a domain for @p (always domain 0 when there is only one) and move @p there; reports an error through scx_bpf_error() if the move fails. */ +static void task_pick_and_set_domain(struct task_ctx *taskc, + struct task_struct *p, + const struct cpumask *cpumask, + bool init_dsq_vtime) +{ + u32 dom_id = 0; + + if (nr_doms > 1) + dom_id = task_pick_domain(taskc, p, cpumask); + + if (!task_set_domain(taskc, p, dom_id, init_dsq_vtime)) + scx_bpf_error("Failed to set dom%d for %s[%d]", + dom_id, p->comm, p->pid); +} +/* Re-pick @p's domain for the new @cpumask and recompute all_cpus (true when @cpumask covers every CPU in all_cpumask). */ +void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, + const struct cpumask *cpumask) +{ + struct task_ctx *taskc; + + if (!(taskc = lookup_task_ctx(p))) + return; + + task_pick_and_set_domain(taskc, p, cpumask, false); + if (all_cpumask) + taskc->all_cpus = + bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); +} +/* Create @p's task_data entry, attach a fresh cpumask to it and assign an initial domain. Returns 0 on success or a negative errno. */ +s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct bpf_cpumask *cpumask; + struct task_ctx taskc = { .dom_active_pids_gen = -1 }; + struct task_ctx *map_value; + long ret; + pid_t pid; + + pid = p->pid; + + /* + * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .exit_task() + * may fail spuriously due to BPF recursion protection triggering + * unnecessarily. + */ + ret = bpf_map_update_elem(&task_data, &pid, &taskc, 0 /*BPF_NOEXIST*/); + if (ret) { + stat_add(RUSTY_STAT_TASK_GET_ERR, 1); + return ret; + } + + /* + * Read the entry from the map immediately so we can add the cpumask + * with bpf_kptr_xchg(). + */ + map_value = bpf_map_lookup_elem(&task_data, &pid); + if (!map_value) + /* Should never happen -- it was just inserted above. */ + return -EINVAL; + + cpumask = bpf_cpumask_create(); + if (!cpumask) { + bpf_map_delete_elem(&task_data, &pid); + return -ENOMEM; + } + + cpumask = bpf_kptr_xchg(&map_value->cpumask, cpumask); + if (cpumask) { + /* Should never happen as we just inserted it above. 
*/ + bpf_cpumask_release(cpumask); + bpf_map_delete_elem(&task_data, &pid); + return -EINVAL; + } + + task_pick_and_set_domain(map_value, p, p->cpus_ptr, true); + + return 0; +} +/* Drop @p's task_data entry; a failed delete is only counted in RUSTY_STAT_TASK_GET_ERR. */ +void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) +{ + pid_t pid = p->pid; + long ret; + + /* + * XXX - There's no reason delete should fail here but BPF's recursion + * protection can unnecessarily fail the operation. The fact that + * deletions aren't reliable means that we sometimes leak task_ctx and + * can't use BPF_NOEXIST on allocation in .init_task(). + */ + ret = bpf_map_delete_elem(&task_data, &pid); + if (ret) { + stat_add(RUSTY_STAT_TASK_GET_ERR, 1); + return; + } +} +/* Set up domain @dom_id: its DSQ, its dom_data entry and its cpumasks. Returns 0 on success or a negative errno. */ +static s32 create_dom(u32 dom_id) +{ + struct dom_ctx domc_init = {}, *domc; + struct bpf_cpumask *cpumask; + u32 cpu; + s32 ret; + + ret = scx_bpf_create_dsq(dom_id, -1); + if (ret < 0) { + scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret); + return ret; + } + + ret = bpf_map_update_elem(&dom_data, &dom_id, &domc_init, 0); + if (ret) { + scx_bpf_error("Failed to add dom_ctx entry %u (%d)", dom_id, ret); + return ret; + } + + domc = bpf_map_lookup_elem(&dom_data, &dom_id); + if (!domc) { + /* Should never happen, we just inserted it above. 
*/ + scx_bpf_error("No dom%u", dom_id); + return -ENOENT; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) { + scx_bpf_error("Failed to create BPF cpumask for domain %u", dom_id); + return -ENOMEM; + } + + for (cpu = 0; cpu < MAX_CPUS; cpu++) { + const volatile u64 *dmask; + + dmask = MEMBER_VPTR(dom_cpumasks, [dom_id][cpu / 64]); + if (!dmask) { + scx_bpf_error("array index error"); + bpf_cpumask_release(cpumask); + return -ENOENT; + } + + if (*dmask & (1LLU << (cpu % 64))) { + bpf_cpumask_set_cpu(cpu, cpumask); + + bpf_rcu_read_lock(); + if (all_cpumask) + bpf_cpumask_set_cpu(cpu, all_cpumask); + bpf_rcu_read_unlock(); + } + } + + cpumask = bpf_kptr_xchg(&domc->cpumask, cpumask); + if (cpumask) { + scx_bpf_error("Domain %u cpumask already present", dom_id); + bpf_cpumask_release(cpumask); + return -EEXIST; + } + + cpumask = bpf_cpumask_create(); + if (!cpumask) { + scx_bpf_error("Failed to create BPF cpumask for domain %u", + dom_id); + return -ENOMEM; + } + + cpumask = bpf_kptr_xchg(&domc->direct_greedy_cpumask, cpumask); + if (cpumask) { + scx_bpf_error("Domain %u direct_greedy_cpumask already present", + dom_id); + bpf_cpumask_release(cpumask); + return -EEXIST; + } + + return 0; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) +{ + struct bpf_cpumask *cpumask; + s32 i, ret; + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&direct_greedy_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + cpumask = bpf_cpumask_create(); + if (!cpumask) + return -ENOMEM; + cpumask = bpf_kptr_xchg(&kick_greedy_cpumask, cpumask); + if (cpumask) + bpf_cpumask_release(cpumask); + + if (!switch_partial) + scx_bpf_switch_all(); + + bpf_for(i, 0, nr_doms) { + ret = create_dom(i); + if (ret) + return ret; + } + + bpf_for(i, 0, nr_cpus) + 
pcpu_ctx[i].dom_rr_cur = i; + + return 0; +} + +void BPF_STRUCT_OPS(rusty_exit, struct scx_exit_info *ei) +{ + bpf_probe_read_kernel_str(exit_msg, sizeof(exit_msg), ei->msg); + exit_kind = ei->kind; +} + +SEC(".struct_ops.link") +struct sched_ext_ops rusty = { + .select_cpu = (void *)rusty_select_cpu, + .enqueue = (void *)rusty_enqueue, + .dispatch = (void *)rusty_dispatch, + .runnable = (void *)rusty_runnable, + .running = (void *)rusty_running, + .stopping = (void *)rusty_stopping, + .quiescent = (void *)rusty_quiescent, + .set_weight = (void *)rusty_set_weight, + .set_cpumask = (void *)rusty_set_cpumask, + .init_task = (void *)rusty_init_task, + .exit_task = (void *)rusty_exit_task, + .init = (void *)rusty_init, + .exit = (void *)rusty_exit, + .name = "rusty", +}; diff --git a/tools/sched_ext/scx_rusty/src/bpf_intf.rs b/tools/sched_ext/scx_rusty/src/bpf_intf.rs new file mode 100644 index 0000000000000..0ed31f8e08738 --- /dev/null +++ b/tools/sched_ext/scx_rusty/src/bpf_intf.rs @@ -0,0 +1,10 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(dead_code)] + +include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs")); diff --git a/tools/sched_ext/scx_rusty/src/bpf_skel.rs b/tools/sched_ext/scx_rusty/src/bpf_skel.rs new file mode 100644 index 0000000000000..063ccf896d61e --- /dev/null +++ b/tools/sched_ext/scx_rusty/src/bpf_skel.rs @@ -0,0 +1,12 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. 
+ +// We can't directly include the generated skeleton in main.rs as it may +// contain compiler attributes that can't be `include!()`ed via macro and we +// can't use the `#[path = "..."]` because `concat!(env!("OUT_DIR"), +// "/bpf.skel.rs")` does not work inside the path attribute yet (see +// https://github.com/rust-lang/rust/pull/83366). + +include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs")); diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs new file mode 100644 index 0000000000000..3192ee049f9f2 --- /dev/null +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -0,0 +1,1271 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. + +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2. +mod bpf_skel; +pub use bpf_skel::*; +pub mod bpf_intf; + +use std::cell::Cell; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::ffi::CStr; +use std::ops::Bound::Included; +use std::ops::Bound::Unbounded; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::time::Duration; +use std::time::Instant; + +use ::fb_procfs as procfs; +use anyhow::anyhow; +use anyhow::bail; +use anyhow::Context; +use anyhow::Result; +use bitvec::prelude::*; +use clap::Parser; +use libbpf_rs::skel::OpenSkel as _; +use libbpf_rs::skel::Skel as _; +use libbpf_rs::skel::SkelBuilder as _; +use log::debug; +use log::info; +use log::trace; +use log::warn; +use ordered_float::OrderedFloat; +use scx_utils::ravg::ravg_read; + +const RAVG_FRAC_BITS: u32 = bpf_intf::ravg_consts_RAVG_FRAC_BITS; +const MAX_DOMS: usize = bpf_intf::consts_MAX_DOMS as usize; +const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize; + +/// scx_rusty: A multi-domain BPF / userspace hybrid scheduler +/// +/// The BPF part does simple vtime or round robin scheduling in each domain +/// while tracking average load of each domain and duty cycle of each task. 
+/// +/// The userspace part performs two roles. First, it makes higher frequency +/// (100ms) tuning decisions. It identifies CPUs which are not too heavily +/// loaded and marks them so that they can pull tasks from other overloaded +/// domains on the fly. +/// +/// Second, it drives lower frequency (2s) load balancing. It determines +/// whether load balancing is necessary by comparing domain load averages. +/// If there are large enough load differences, it examines up to 1024 +/// recently active tasks on the domain to determine which should be +/// migrated. +/// +/// The overhead of userspace operations is low. Load balancing is not +/// performed frequently but work-conservation is still maintained through +/// tuning and greedy execution. Load balancing itself is not that expensive +/// either. It only accesses per-domain load metrics to determine the +/// domains that need load balancing and a limited number of per-task metrics +/// for each pushing domain. +/// +/// An earlier variant of this scheduler was used to balance across six +/// domains, each representing a chiplet in a six-chiplet AMD processor, and +/// could match the performance of production setup using CFS. +/// +/// WARNING: Very high weight (low nice value) tasks can throw off load +/// balancing due to the infeasible weight problem. This problem will be solved +/// in the near future. +/// +/// WARNING: scx_rusty currently assumes that all domains have equal +/// processing power and are at similar distances from each other. This +/// limitation will be removed in the future. +#[derive(Debug, Parser)] +struct Opts { + /// Scheduling slice duration in microseconds. + #[clap(short = 's', long, default_value = "20000")] + slice_us: u64, + + /// Monitoring and load balance interval in seconds. + #[clap(short = 'i', long, default_value = "2.0")] + interval: f64, + + /// Tuner runs at higher frequency than the load balancer to dynamically + /// tune scheduling behavior. Tuning interval in seconds.
+ #[clap(short = 'I', long, default_value = "0.1")] + tune_interval: f64, + + /// The half-life of task and domain load running averages in seconds. + #[clap(short = 'l', long, default_value = "1.0")] + load_half_life: f64, + + /// Build domains according to how CPUs are grouped at this cache level + /// as determined by /sys/devices/system/cpu/cpuX/cache/indexI/id. + #[clap(short = 'c', long, default_value = "3")] + cache_level: u32, + + /// Instead of using cache locality, set the cpumask for each domain + /// manually, provide multiple --cpumasks, one for each domain. E.g. + /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains with + /// the corresponding CPUs belonging to each domain. Each CPU must + /// belong to precisely one domain. + #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")] + cpumasks: Vec, + + /// When non-zero, enable greedy task stealing. When a domain is idle, a + /// cpu will attempt to steal tasks from a domain with at least + /// greedy_threshold tasks enqueued. These tasks aren't permanently + /// stolen from the domain. + #[clap(short = 'g', long, default_value = "1")] + greedy_threshold: u32, + + /// Disable load balancing. Unless disabled, periodically userspace will + /// calculate the load factor of each domain and instruct BPF which + /// processes to move. + #[clap(long, action = clap::ArgAction::SetTrue)] + no_load_balance: bool, + + /// Put per-cpu kthreads directly into local dsq's. + #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)] + kthreads_local: bool, + + /// In recent kernels (>=v6.6), the kernel is responsible for balancing + /// kworkers across L3 cache domains. Exclude them from load-balancing + /// to avoid conflicting operations. Greedy executions still apply. + #[clap(short = 'b', long, action = clap::ArgAction::SetTrue)] + balanced_kworkers: bool, + + /// Use FIFO scheduling instead of weighted vtime scheduling. 
+ #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)] + fifo_sched: bool, + + /// Idle CPUs with utilization lower than this will get remote tasks + /// directly pushed on them. 0 disables, 100 enables always. + #[clap(short = 'D', long, default_value = "90.0")] + direct_greedy_under: f64, + + /// Idle CPUs with utilization lower than this may get kicked to + /// accelerate stealing when a task is queued on a saturated remote + /// domain. 0 disables, 100 enables always. + #[clap(short = 'K', long, default_value = "100.0")] + kick_greedy_under: f64, + + /// If specified, only tasks which have their scheduling policy set to + /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all + /// tasks are switched. + #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)] + partial: bool, + + /// Enable verbose output including libbpf details. Specify multiple + /// times to increase verbosity. + #[clap(short = 'v', long, action = clap::ArgAction::Count)] + verbose: u8, +} + +fn now_monotonic() -> u64 { + let mut time = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; + assert!(ret == 0); + time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 +} + +fn clear_map(map: &libbpf_rs::Map) { + for key in map.keys() { + let _ = map.delete(&key); // best effort: a failed delete is deliberately ignored + } +} + +fn format_cpumask(cpumask: &[u64], nr_cpus: usize) -> String { + cpumask + .iter() + .take((nr_cpus + 63) / 64) // ceil(nr_cpus / 64): only the u64 words that hold CPU bits + .rev() // most significant word first + .fold(String::new(), |acc, x| format!("{} {:016X}", acc, x)) +} + +fn read_total_cpu(reader: &procfs::ProcReader) -> Result { + reader + .read_stat() + .context("Failed to read procfs")?
+ .total_cpu + .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) +} + +fn sub_or_zero(curr: &u64, prev: &u64) -> u64 +{ + if let Some(res) = curr.checked_sub(*prev) { + res + } else { + 0 + } +} + +fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { + match (curr, prev) { + ( + procfs::CpuStat { + user_usec: Some(curr_user), + nice_usec: Some(curr_nice), + system_usec: Some(curr_system), + idle_usec: Some(curr_idle), + iowait_usec: Some(curr_iowait), + irq_usec: Some(curr_irq), + softirq_usec: Some(curr_softirq), + stolen_usec: Some(curr_stolen), + .. + }, + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + .. + }, + ) => { + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); + + let busy_usec = + user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; + let total_usec = idle_usec + busy_usec + iowait_usec; + if total_usec > 0 { + Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) + } else { + Ok(1.0) + } + } + _ => { + bail!("Missing stats in cpustat"); + } + } +} + +#[derive(Debug)] +struct Topology { + nr_cpus: usize, + nr_doms: usize, + dom_cpus: Vec>, + cpu_dom: Vec>, +} + +impl Topology { + fn from_cpumasks(cpumasks: &[String], nr_cpus: usize) -> Result { + if cpumasks.len() > MAX_DOMS { + bail!( + "Number of requested domains ({}) is greater than MAX_DOMS ({})", + cpumasks.len(), + 
MAX_DOMS + ); + } + let mut cpu_dom = vec![None; nr_cpus]; + let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; cpumasks.len()]; + for (dom, cpumask) in cpumasks.iter().enumerate() { + let hex_str = { + let mut tmp_str = cpumask + .strip_prefix("0x") + .unwrap_or(cpumask) + .replace('_', ""); + if tmp_str.len() % 2 != 0 { + tmp_str = "0".to_string() + &tmp_str; + } + tmp_str + }; + let byte_vec = hex::decode(&hex_str) + .with_context(|| format!("Failed to parse cpumask: {}", cpumask))?; + + for (index, &val) in byte_vec.iter().rev().enumerate() { + let mut v = val; + while v != 0 { + let lsb = v.trailing_zeros() as usize; + v &= !(1 << lsb); + let cpu = index * 8 + lsb; + if cpu >= nr_cpus { // valid ids are 0..nr_cpus; cpu_dom has exactly nr_cpus entries + bail!( + concat!( + "Found cpu ({}) in cpumask ({}) which is larger", + " than the number of cpus on the machine ({})" + ), + cpu, + cpumask, + nr_cpus + ); + } + if let Some(other_dom) = cpu_dom[cpu] { + bail!( + "Found cpu ({}) with domain ({}) but also in cpumask ({})", + cpu, + other_dom, + cpumask + ); + } + cpu_dom[cpu] = Some(dom); + dom_cpus[dom].set(cpu, true); + } + } + dom_cpus[dom].set_uninitialized(false); + } + + for (cpu, dom) in cpu_dom.iter().enumerate() { + if dom.is_none() { + bail!( + "CPU {} not assigned to any domain. Make sure it is covered by some --cpumasks argument.", + cpu + ); + } + } + + Ok(Self { + nr_cpus, + nr_doms: dom_cpus.len(), + dom_cpus, + cpu_dom, + }) + } + + fn from_cache_level(level: u32, nr_cpus: usize) -> Result { + let mut cpu_to_cache = vec![]; // (cpu_id, Option) + let mut cache_ids = BTreeSet::::new(); + let mut nr_offline = 0; + + // Build cpu -> cache ID mapping.
+ for cpu in 0..nr_cpus { + let path = format!("/sys/devices/system/cpu/cpu{}/cache/index{}/id", cpu, level); + let id = match std::fs::read_to_string(&path) { + Ok(val) => Some(val.trim().parse::().with_context(|| { + format!("Failed to parse {:?}'s content {:?}", &path, &val) + })?), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + nr_offline += 1; + None + } + Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), + }; + + cpu_to_cache.push(id); + if let Some(id) = id { + cache_ids.insert(id); + } + } + + info!( + "CPUs: online/possible = {}/{}", + nr_cpus - nr_offline, + nr_cpus + ); + + // Cache IDs may have holes. Assign consecutive domain IDs to + // existing cache IDs. + let mut cache_to_dom = BTreeMap::::new(); + let mut nr_doms = 0; + for cache_id in cache_ids.iter() { + cache_to_dom.insert(*cache_id, nr_doms); + nr_doms += 1; + } + + if nr_doms > MAX_DOMS { + bail!( + "Total number of doms {} is greater than MAX_DOMS ({})", + nr_doms, + MAX_DOMS + ); + } + + // Build and return dom -> cpumask and cpu -> dom mappings. + let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; nr_doms]; + let mut cpu_dom = vec![]; + + for (cpu, cache) in cpu_to_cache.iter().enumerate().take(nr_cpus) { + match cache { + Some(cache_id) => { + let dom_id = cache_to_dom[cache_id]; + dom_cpus[dom_id].set(cpu, true); + cpu_dom.push(Some(dom_id)); + } + None => { + dom_cpus[0].set(cpu, true); + cpu_dom.push(None); + } + } + } + + Ok(Self { + nr_cpus, + nr_doms: dom_cpus.len(), + dom_cpus, + cpu_dom, + }) + } +} + +struct Tuner { + top: Arc, + direct_greedy_under: f64, + kick_greedy_under: f64, + proc_reader: procfs::ProcReader, + prev_cpu_stats: BTreeMap, + dom_utils: Vec, +} + +impl Tuner { + fn new(top: Arc, opts: &Opts) -> Result { + let proc_reader = procfs::ProcReader::new(); + let prev_cpu_stats = proc_reader + .read_stat()? 
+ .cpus_map + .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; + Ok(Self { + direct_greedy_under: opts.direct_greedy_under / 100.0, + kick_greedy_under: opts.kick_greedy_under / 100.0, + proc_reader, + prev_cpu_stats, + dom_utils: vec![0.0; top.nr_doms], + top, + }) + } + + fn step(&mut self, skel: &mut BpfSkel) -> Result<()> { + let curr_cpu_stats = self + .proc_reader + .read_stat()? + .cpus_map + .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; + let ti = &mut skel.bss_mut().tune_input; + let mut dom_nr_cpus = vec![0; self.top.nr_doms]; + let mut dom_util_sum = vec![0.0; self.top.nr_doms]; + + for cpu in 0..self.top.nr_cpus { + let cpu32 = cpu as u32; + // None domain indicates the CPU was offline during + // initialization and None CpuStat indicates the CPU has gone + // down since then. Ignore both. + if let (Some(dom), Some(curr), Some(prev)) = ( + self.top.cpu_dom[cpu], + curr_cpu_stats.get(&cpu32), + self.prev_cpu_stats.get(&cpu32), + ) { + dom_nr_cpus[dom] += 1; + dom_util_sum[dom] += calc_util(curr, prev)?; + } + } + + for dom in 0..self.top.nr_doms { + // Calculate the domain avg util. If there are no active CPUs, + // it doesn't really matter. Go with 0.0 as that's less likely + // to confuse users. + let util = match dom_nr_cpus[dom] { + 0 => 0.0, + nr => dom_util_sum[dom] / nr as f64, + }; + + self.dom_utils[dom] = util; + + // This could be implemented better. 
+ let update_dom_bits = |target: &mut [u64; 8], val: bool| { + for cpu in 0..self.top.nr_cpus { + if let Some(cdom) = self.top.cpu_dom[cpu] { + if cdom == dom { + if val { + target[cpu / 64] |= 1u64 << (cpu % 64); + } else { + target[cpu / 64] &= !(1u64 << (cpu % 64)); + } + } + } + } + }; + + update_dom_bits( + &mut ti.direct_greedy_cpumask, + self.direct_greedy_under > 0.99999 || util < self.direct_greedy_under, + ); + update_dom_bits( + &mut ti.kick_greedy_cpumask, + self.kick_greedy_under > 0.99999 || util < self.kick_greedy_under, + ); + } + + ti.gen += 1; + self.prev_cpu_stats = curr_cpu_stats; + Ok(()) + } +} + +#[derive(Debug)] +struct TaskInfo { + pid: i32, + dom_mask: u64, + migrated: Cell, + is_kworker: bool, +} + +struct LoadBalancer<'a, 'b, 'c> { + skel: &'a mut BpfSkel<'b>, + top: Arc, + skip_kworkers: bool, + + tasks_by_load: Vec, TaskInfo>>>, + load_avg: f64, + dom_loads: Vec, + + imbal: Vec, + doms_to_push: BTreeMap, u32>, + doms_to_pull: BTreeMap, u32>, + + nr_lb_data_errors: &'c mut u64, +} + +impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { + // If imbalance gets higher than this ratio, try to balance the loads. + const LOAD_IMBAL_HIGH_RATIO: f64 = 0.10; + + // Aim to transfer this fraction of the imbalance on each round. We want + // to be gradual to avoid unnecessary oscillations. While this can delay + // convergence, greedy execution should be able to bridge the temporary + // gap. + const LOAD_IMBAL_XFER_TARGET_RATIO: f64 = 0.50; + + // Don't push out more than this ratio of load on each round. While this + // overlaps with XFER_TARGET_RATIO, XFER_TARGET_RATIO only defines the + // target and doesn't limit the total load. As long as the transfer + // reduces load imbalance between the two involved domains, it'd happily + // transfer whatever amount that can be transferred. This limit is used + // as the safety cap to avoid draining a given domain too much in a + // single round. 
+ const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; + + fn new( + skel: &'a mut BpfSkel<'b>, + top: Arc, + skip_kworkers: bool, + nr_lb_data_errors: &'c mut u64, + ) -> Self { + Self { + skel, + skip_kworkers, + + tasks_by_load: (0..top.nr_doms).map(|_| None).collect(), + load_avg: 0f64, + dom_loads: vec![0.0; top.nr_doms], + + imbal: vec![0.0; top.nr_doms], + doms_to_pull: BTreeMap::new(), + doms_to_push: BTreeMap::new(), + + nr_lb_data_errors, + + top, + } + } + + fn read_dom_loads(&mut self) -> Result<()> { + let now_mono = now_monotonic(); + let load_half_life = self.skel.rodata().load_half_life; + let maps = self.skel.maps(); + let dom_data = maps.dom_data(); + let mut load_sum = 0.0f64; + + for i in 0..self.top.nr_doms { + let key = unsafe { std::mem::transmute::(i as u32) }; + + if let Some(dom_ctx_map_elem) = dom_data + .lookup(&key, libbpf_rs::MapFlags::ANY) + .context("Failed to lookup dom_ctx")? + { + let dom_ctx = + unsafe { &*(dom_ctx_map_elem.as_slice().as_ptr() as *const bpf_intf::dom_ctx) }; + + let rd = &dom_ctx.load_rd; + self.dom_loads[i] = ravg_read( + rd.val, + rd.val_at, + rd.old, + rd.cur, + now_mono, + load_half_life, + RAVG_FRAC_BITS, + ); + + load_sum += self.dom_loads[i]; + } + } + + self.load_avg = load_sum / self.top.nr_doms as f64; + + Ok(()) + } + + /// To balance dom loads, identify doms with lower and higher load than + /// average. + fn calculate_dom_load_balance(&mut self) -> Result<()> { + for (dom, dom_load) in self.dom_loads.iter().enumerate() { + let imbal = dom_load - self.load_avg; + if imbal.abs() >= self.load_avg * Self::LOAD_IMBAL_HIGH_RATIO { + if imbal > 0f64 { + self.doms_to_push.insert(OrderedFloat(imbal), dom as u32); + } else { + self.doms_to_pull.insert(OrderedFloat(-imbal), dom as u32); + } + self.imbal[dom] = imbal; + } + } + Ok(()) + } + + /// @dom needs to push out tasks to balance loads. Make sure its + /// tasks_by_load is populated so that the victim tasks can be picked. 
+ fn populate_tasks_by_load(&mut self, dom: u32) -> Result<()> { + if self.tasks_by_load[dom as usize].is_some() { + return Ok(()); + } + + // Read active_pids and update write_idx and gen. + // + // XXX - We can't read task_ctx inline because self.skel.bss() + // borrows mutably and thus conflicts with self.skel.maps(). + const MAX_PIDS: u64 = bpf_intf::consts_MAX_DOM_ACTIVE_PIDS as u64; + let active_pids = &mut self.skel.bss_mut().dom_active_pids[dom as usize]; + let mut pids = vec![]; + + let (mut ridx, widx) = (active_pids.read_idx, active_pids.write_idx); + if widx - ridx > MAX_PIDS { + ridx = widx - MAX_PIDS; + } + + for idx in ridx..widx { + let pid = active_pids.pids[(idx % MAX_PIDS) as usize]; + pids.push(pid); + } + + active_pids.read_idx = active_pids.write_idx; + active_pids.gen += 1; + + // Read task_ctx and load. + let load_half_life = self.skel.rodata().load_half_life; + let maps = self.skel.maps(); + let task_data = maps.task_data(); + let now_mono = now_monotonic(); + let mut tasks_by_load = BTreeMap::new(); + + for pid in pids.iter() { + let key = unsafe { std::mem::transmute::(*pid) }; + + if let Some(task_data_elem) = task_data.lookup(&key, libbpf_rs::MapFlags::ANY)? 
{ + let task_ctx = + unsafe { &*(task_data_elem.as_slice().as_ptr() as *const bpf_intf::task_ctx) }; + + if task_ctx.dom_id != dom { + continue; + } + + let rd = &task_ctx.dcyc_rd; + let load = task_ctx.weight as f64 + * ravg_read( + rd.val, + rd.val_at, + rd.old, + rd.cur, + now_mono, + load_half_life, + RAVG_FRAC_BITS, + ); + + tasks_by_load.insert( + OrderedFloat(load), + TaskInfo { + pid: *pid, + dom_mask: task_ctx.dom_mask, + migrated: Cell::new(false), + is_kworker: task_ctx.is_kworker, + }, + ); + } + } + + debug!( + "DOM[{:02}] read load for {} tasks", + dom, + &tasks_by_load.len(), + ); + trace!("DOM[{:02}] tasks_by_load={:?}", dom, &tasks_by_load); + + self.tasks_by_load[dom as usize] = Some(tasks_by_load); + Ok(()) + } + + // Find the first candidate pid which hasn't already been migrated and + // can run in @pull_dom. + fn find_first_candidate<'d, I>( + tasks_by_load: I, + pull_dom: u32, + skip_kworkers: bool, + ) -> Option<(f64, &'d TaskInfo)> + where + I: IntoIterator, &'d TaskInfo)>, + { + match tasks_by_load + .into_iter() + .skip_while(|(_, task)| { + task.migrated.get() + || (task.dom_mask & (1 << pull_dom) == 0) + || (skip_kworkers && task.is_kworker) + }) + .next() + { + Some((OrderedFloat(load), task)) => Some((*load, task)), + None => None, + } + } + + fn pick_victim( + &mut self, + (push_dom, to_push): (u32, f64), + (pull_dom, to_pull): (u32, f64), + ) -> Result> { + let to_xfer = to_pull.min(to_push) * Self::LOAD_IMBAL_XFER_TARGET_RATIO; + + debug!( + "considering dom {}@{:.2} -> {}@{:.2}", + push_dom, to_push, pull_dom, to_pull + ); + + let calc_new_imbal = |xfer: f64| (to_push - xfer).abs() + (to_pull - xfer).abs(); + + self.populate_tasks_by_load(push_dom)?; + + // We want to pick a task to transfer from push_dom to pull_dom to + // reduce the load imbalance between the two closest to $to_xfer. + // IOW, pick a task which has the closest load value to $to_xfer + // that can be migrated. 
Find such task by locating the first + // migratable task while scanning left from $to_xfer and the + // counterpart while scanning right and picking the better of the + // two. + let (load, task, new_imbal) = match ( + Self::find_first_candidate( + self.tasks_by_load[push_dom as usize] + .as_ref() + .unwrap() + .range((Unbounded, Included(&OrderedFloat(to_xfer)))) + .rev(), + pull_dom, + self.skip_kworkers, + ), + Self::find_first_candidate( + self.tasks_by_load[push_dom as usize] + .as_ref() + .unwrap() + .range((Included(&OrderedFloat(to_xfer)), Unbounded)), + pull_dom, + self.skip_kworkers, + ), + ) { + (None, None) => return Ok(None), + (Some((load, task)), None) | (None, Some((load, task))) => { + (load, task, calc_new_imbal(load)) + } + (Some((load0, task0)), Some((load1, task1))) => { + let (new_imbal0, new_imbal1) = (calc_new_imbal(load0), calc_new_imbal(load1)); + if new_imbal0 <= new_imbal1 { + (load0, task0, new_imbal0) + } else { + (load1, task1, new_imbal1) + } + } + }; + + // If the best candidate can't reduce the imbalance, there's nothing + // to do for this pair. + let old_imbal = to_push + to_pull; + if old_imbal < new_imbal { + debug!( + "skipping pid {}, dom {} -> {} won't improve imbal {:.2} -> {:.2}", + task.pid, push_dom, pull_dom, old_imbal, new_imbal + ); + return Ok(None); + } + + debug!( + "migrating pid {}, dom {} -> {}, imbal={:.2} -> {:.2}", + task.pid, push_dom, pull_dom, old_imbal, new_imbal, + ); + + Ok(Some((task, load))) + } + + // Actually execute the load balancing. Concretely this writes pid -> dom + // entries into the lb_data map for bpf side to consume. + fn load_balance(&mut self) -> Result<()> { + clear_map(self.skel.maps().lb_data()); + + debug!("imbal={:?}", &self.imbal); + debug!("doms_to_push={:?}", &self.doms_to_push); + debug!("doms_to_pull={:?}", &self.doms_to_pull); + + // Push from the most imbalanced to least. 
+ while let Some((OrderedFloat(mut to_push), push_dom)) = self.doms_to_push.pop_last() { + let push_max = self.dom_loads[push_dom as usize] * Self::LOAD_IMBAL_PUSH_MAX_RATIO; + let mut pushed = 0f64; + + // Transfer tasks from push_dom to reduce imbalance. + loop { + let last_pushed = pushed; + + // Pull from the most imbalanced to least. + let mut doms_to_pull = BTreeMap::<_, _>::new(); + std::mem::swap(&mut self.doms_to_pull, &mut doms_to_pull); + let mut pull_doms = doms_to_pull.into_iter().rev().collect::>(); + + for (to_pull, pull_dom) in pull_doms.iter_mut() { + if let Some((task, load)) = + self.pick_victim((push_dom, to_push), (*pull_dom, f64::from(*to_pull)))? + { + // Execute migration. + task.migrated.set(true); + to_push -= load; + *to_pull -= load; + pushed += load; + + // Ask BPF code to execute the migration. + let pid = task.pid; + let cpid = (pid as libc::pid_t).to_ne_bytes(); + if let Err(e) = self.skel.maps_mut().lb_data().update( + &cpid, + &pull_dom.to_ne_bytes(), + libbpf_rs::MapFlags::NO_EXIST, + ) { + warn!( + "Failed to update lb_data map for pid={} error={:?}", + pid, &e + ); + *self.nr_lb_data_errors += 1; + } + + // Always break after a successful migration so that + // the pulling domains are always considered in the + // descending imbalance order. + break; + } + } + + pull_doms + .into_iter() + .map(|(k, v)| self.doms_to_pull.insert(k, v)) + .count(); // count() only drives the lazy map so every entry is re-inserted + + // Stop repeating if nothing got transferred or pushed enough. + if pushed == last_pushed || pushed >= push_max { + break; + } + } + } + Ok(()) + } +} + +struct Scheduler<'a> { + skel: BpfSkel<'a>, + struct_ops: Option, + + sched_interval: Duration, + tune_interval: Duration, + balance_load: bool, + balanced_kworkers: bool, + + top: Arc, + proc_reader: procfs::ProcReader, + + prev_at: Instant, + prev_total_cpu: procfs::CpuStat, + + nr_lb_data_errors: u64, + + tuner: Tuner, +} + +impl<'a> Scheduler<'a> { + fn init(opts: &Opts) -> Result { + // Open the BPF prog first for verification.
+ let mut skel_builder = BpfSkelBuilder::default(); + skel_builder.obj_builder.debug(opts.verbose > 0); + let mut skel = skel_builder.open().context("Failed to open BPF program")?; + + let nr_cpus = libbpf_rs::num_possible_cpus().unwrap(); + if nr_cpus > MAX_CPUS { + bail!( + "nr_cpus ({}) is greater than MAX_CPUS ({})", + nr_cpus, + MAX_CPUS + ); + } + + // Initialize skel according to @opts. + let top = Arc::new(if !opts.cpumasks.is_empty() { + Topology::from_cpumasks(&opts.cpumasks, nr_cpus)? + } else { + Topology::from_cache_level(opts.cache_level, nr_cpus)? + }); + + skel.rodata_mut().nr_doms = top.nr_doms as u32; + skel.rodata_mut().nr_cpus = top.nr_cpus as u32; + + for (cpu, dom) in top.cpu_dom.iter().enumerate() { + skel.rodata_mut().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; + } + + for (dom, cpus) in top.dom_cpus.iter().enumerate() { + let raw_cpus_slice = cpus.as_raw_slice(); + let dom_cpumask_slice = &mut skel.rodata_mut().dom_cpumasks[dom]; + let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpus_slice.len()); + left.clone_from_slice(cpus.as_raw_slice()); + info!( + "DOM[{:02}] cpumask{} ({} cpus)", + dom, + &format_cpumask(dom_cpumask_slice, nr_cpus), + cpus.count_ones() + ); + } + + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().load_half_life = (opts.load_half_life * 1000000000.0) as u32; + skel.rodata_mut().kthreads_local = opts.kthreads_local; + skel.rodata_mut().fifo_sched = opts.fifo_sched; + skel.rodata_mut().switch_partial = opts.partial; + skel.rodata_mut().greedy_threshold = opts.greedy_threshold; + skel.rodata_mut().debug = opts.verbose as u32; + + // Attach. + let mut skel = skel.load().context("Failed to load BPF program")?; + skel.attach().context("Failed to attach BPF program")?; + let struct_ops = Some( + skel.maps_mut() + .rusty() + .attach_struct_ops() + .context("Failed to attach rusty struct ops")?, + ); + info!("Rusty Scheduler Attached"); + + // Other stuff. 
+ let proc_reader = procfs::ProcReader::new(); + let prev_total_cpu = read_total_cpu(&proc_reader)?; + + Ok(Self { + skel, + struct_ops, // should be held to keep it attached + + sched_interval: Duration::from_secs_f64(opts.interval), + tune_interval: Duration::from_secs_f64(opts.tune_interval), + balance_load: !opts.no_load_balance, + balanced_kworkers: opts.balanced_kworkers, + + top: top.clone(), + proc_reader, + + prev_at: Instant::now(), + prev_total_cpu, + + nr_lb_data_errors: 0, + + tuner: Tuner::new(top, opts)?, + }) + } + + fn get_cpu_busy(&mut self) -> Result { + let total_cpu = read_total_cpu(&self.proc_reader)?; + let busy = match (&self.prev_total_cpu, &total_cpu) { + ( + procfs::CpuStat { + user_usec: Some(prev_user), + nice_usec: Some(prev_nice), + system_usec: Some(prev_system), + idle_usec: Some(prev_idle), + iowait_usec: Some(prev_iowait), + irq_usec: Some(prev_irq), + softirq_usec: Some(prev_softirq), + stolen_usec: Some(prev_stolen), + guest_usec: _, + guest_nice_usec: _, + }, + procfs::CpuStat { + user_usec: Some(curr_user), + nice_usec: Some(curr_nice), + system_usec: Some(curr_system), + idle_usec: Some(curr_idle), + iowait_usec: Some(curr_iowait), + irq_usec: Some(curr_irq), + softirq_usec: Some(curr_softirq), + stolen_usec: Some(curr_stolen), + guest_usec: _, + guest_nice_usec: _, + }, + ) => { + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); + + let busy_usec = + user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; + let total_usec = idle_usec + busy_usec + iowait_usec; + if total_usec > 0 { busy_usec as f64 / total_usec as f64 } else { 1.0 } // no elapsed time would give 0/0 = NaN; same fallback as calc_util() + } + _ => { +
bail!("Some procfs stats are not populated!"); + } + }; + + self.prev_total_cpu = total_cpu; + Ok(busy) + } + + fn read_bpf_stats(&mut self) -> Result> { + let mut maps = self.skel.maps_mut(); + let stats_map = maps.stats(); + let mut stats: Vec = Vec::new(); + let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.top.nr_cpus]; + + for stat in 0..bpf_intf::stat_idx_RUSTY_NR_STATS { + let cpu_stat_vec = stats_map + .lookup_percpu(&stat.to_ne_bytes(), libbpf_rs::MapFlags::ANY) + .with_context(|| format!("Failed to lookup stat {}", stat))? + .expect("per-cpu stat should exist"); + let sum = cpu_stat_vec + .iter() + .map(|val| { + u64::from_ne_bytes( + val.as_slice() + .try_into() + .expect("Invalid value length in stat map"), + ) + }) + .sum(); + stats_map + .update_percpu(&stat.to_ne_bytes(), &zero_vec, libbpf_rs::MapFlags::ANY) + .context("Failed to zero stat")?; + stats.push(sum); + } + Ok(stats) + } + + fn report( + &mut self, + stats: &[u64], + cpu_busy: f64, + processing_dur: Duration, + load_avg: f64, + dom_loads: &[f64], + imbal: &[f64], + ) { + let stat = |idx| stats[idx as usize]; + let total = stat(bpf_intf::stat_idx_RUSTY_STAT_WAKE_SYNC) + + stat(bpf_intf::stat_idx_RUSTY_STAT_PREV_IDLE) + + stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_IDLE) + + stat(bpf_intf::stat_idx_RUSTY_STAT_PINNED) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_DISPATCH) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR) + + stat(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH) + + stat(bpf_intf::stat_idx_RUSTY_STAT_GREEDY); + + info!( + "cpu={:7.2} bal={} load_avg={:8.2} task_err={} lb_data_err={} proc={:?}ms", + cpu_busy * 100.0, + stats[bpf_intf::stat_idx_RUSTY_STAT_LOAD_BALANCE as usize], + load_avg, + stats[bpf_intf::stat_idx_RUSTY_STAT_TASK_GET_ERR as usize], + self.nr_lb_data_errors, + processing_dur.as_millis(), + ); + + let stat_pct = |idx| stat(idx) as f64 / total as f64 * 100.0; + + info!( + "tot={:7} 
wsync={:5.2} prev_idle={:5.2} greedy_idle={:5.2} pin={:5.2}", + total, + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_WAKE_SYNC), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_PREV_IDLE), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY_IDLE), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_PINNED), + ); + + info!( + "dir={:5.2} dir_greedy={:5.2} dir_greedy_far={:5.2}", + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_DISPATCH), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR), + ); + + info!( + "dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}", + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_DSQ_DISPATCH), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_GREEDY), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_KICK_GREEDY), + stat_pct(bpf_intf::stat_idx_RUSTY_STAT_REPATRIATE), + ); + + let ti = &self.skel.bss().tune_input; + info!( + "direct_greedy_cpumask={}", + format_cpumask(&ti.direct_greedy_cpumask, self.top.nr_cpus) + ); + info!( + " kick_greedy_cpumask={}", + format_cpumask(&ti.kick_greedy_cpumask, self.top.nr_cpus) + ); + + for i in 0..self.top.nr_doms { + info!( + "DOM[{:02}] util={:6.2} load={:8.2} imbal={}", + i, + self.tuner.dom_utils[i] * 100.0, + dom_loads[i], + if imbal[i] == 0.0 { + format!("{:9.2}", 0.0) + } else { + format!("{:+9.2}", imbal[i]) + }, + ); + } + } + + fn lb_step(&mut self) -> Result<()> { + let started_at = Instant::now(); + let bpf_stats = self.read_bpf_stats()?; + let cpu_busy = self.get_cpu_busy()?; + + let mut lb = LoadBalancer::new( + &mut self.skel, + self.top.clone(), + self.balanced_kworkers, + &mut self.nr_lb_data_errors, + ); + + lb.read_dom_loads()?; + lb.calculate_dom_load_balance()?; + + if self.balance_load { + lb.load_balance()?; + } + + // Extract fields needed for reporting and drop lb to release + // mutable borrows. 
+ let (load_avg, dom_loads, imbal) = (lb.load_avg, lb.dom_loads, lb.imbal); + + self.report( + &bpf_stats, + cpu_busy, + Instant::now().duration_since(started_at), + load_avg, + &dom_loads, + &imbal, + ); + + self.prev_at = started_at; + Ok(()) + } + + fn read_bpf_exit_kind(&mut self) -> i32 { + unsafe { std::ptr::read_volatile(&self.skel.bss().exit_kind as *const _) } + } + + fn report_bpf_exit_kind(&mut self) -> Result<()> { + // Report msg if EXT_OPS_EXIT_ERROR. + match self.read_bpf_exit_kind() { + 0 => Ok(()), + etype if etype == 2 => { + let cstr = unsafe { CStr::from_ptr(self.skel.bss().exit_msg.as_ptr() as *const _) }; + let msg = cstr + .to_str() + // Propagate the error instead of panicking if exit_msg is not valid UTF-8. + .context("Failed to convert exit msg to string")?; + bail!("BPF exit_kind={} msg={}", etype, msg); + } + etype => { + info!("BPF exit_kind={}", etype); + Ok(()) + } + } + } + + fn run(&mut self, shutdown: Arc) -> Result<()> { + let now = Instant::now(); + let mut next_tune_at = now + self.tune_interval; + let mut next_sched_at = now + self.sched_interval; + + while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_kind() == 0 { + let now = Instant::now(); + + if now >= next_tune_at { + self.tuner.step(&mut self.skel)?; + next_tune_at += self.tune_interval; + if next_tune_at < now { + next_tune_at = now + self.tune_interval; + } + } + + if now >= next_sched_at { + self.lb_step()?; + next_sched_at += self.sched_interval; + if next_sched_at < now { + next_sched_at = now + self.sched_interval; + } + } + + std::thread::sleep( + next_sched_at + .min(next_tune_at) + .duration_since(Instant::now()), + ); + } + + self.report_bpf_exit_kind() + } +} + +impl<'a> Drop for Scheduler<'a> { + fn drop(&mut self) { + if let Some(struct_ops) = self.struct_ops.take() { + drop(struct_ops); + } + } +} + +fn main() -> Result<()> { + let opts = Opts::parse(); + + let llv = match opts.verbose { + 0 => simplelog::LevelFilter::Info, + 1 => simplelog::LevelFilter::Debug, + _ => simplelog::LevelFilter::Trace, + }; + let mut
lcfg = simplelog::ConfigBuilder::new(); + lcfg.set_time_level(simplelog::LevelFilter::Error) + .set_location_level(simplelog::LevelFilter::Off) + .set_target_level(simplelog::LevelFilter::Off) + .set_thread_level(simplelog::LevelFilter::Off); + simplelog::TermLogger::init( + llv, + lcfg.build(), + simplelog::TerminalMode::Stderr, + simplelog::ColorChoice::Auto, + )?; + + let mut sched = Scheduler::init(&opts)?; + + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_clone = shutdown.clone(); + ctrlc::set_handler(move || { + shutdown_clone.store(true, Ordering::Relaxed); + }) + .context("Error setting Ctrl-C handler")?; + + sched.run(shutdown) +} diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py new file mode 100644 index 0000000000000..d457d2a74e1ef --- /dev/null +++ b/tools/sched_ext/scx_show_state.py @@ -0,0 +1,39 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2024 Tejun Heo +# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current sched_ext state. +For more info on drgn, visit https://github.com/osandov/drgn. 
+""" + +import drgn +import sys + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def read_int(name): + return int(prog[name].value_()) + +def read_atomic(name): + return prog[name].counter.value_() + +def read_static_key(name): + return prog[name].key.enabled.counter.value_() + +def ops_state_str(state): + return prog['scx_ops_enable_state_str'][state].string_().decode() + +ops = prog['scx_ops'] +enable_state = read_atomic("scx_ops_enable_state_var") + +print(f'ops : {ops.name.string_().decode()}') +print(f'enabled : {read_static_key("__scx_ops_enabled")}') +print(f'switching_all : {read_int("scx_switching_all")}') +print(f'switched_all : {read_static_key("__scx_switched_all")}') +print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c new file mode 100644 index 0000000000000..95035aa29b10e --- /dev/null +++ b/tools/sched_ext/scx_simple.bpf.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A simple scheduler. + * + * By default, it operates as a simple global weighted vtime scheduler and can + * be switched to FIFO scheduling. It also demonstrates the following niceties. + * + * - Statistics tracking how many tasks are queued to local and global dsq's. + * - Termination notification for userspace. + * + * While very simple, this scheduler should work reasonably well on CPUs with a + * uniform L3 cache topology. While preemption is not implemented, the fact that + * the scheduling queue is shared across all CPUs means that whatever is at the + * front of the queue is likely to be executed fairly quickly given enough + * number of CPUs. 
The FIFO scheduling mode may be beneficial to some workloads + * but comes with the usual problems with FIFO scheduling where saturating + * threads can easily drown out interactive ones. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include + +char _license[] SEC("license") = "GPL"; + +const volatile bool fifo_sched; +const volatile bool switch_partial; + +static u64 vtime_now; +struct user_exit_info uei; + +#define SHARED_DSQ 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* count local queueing */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* count global queueing */ + + if (fifo_sched) { + scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. 
+ */ + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SHARED_DSQ); +} + +void BPF_STRUCT_OPS(simple_running, struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. 
+ */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + if (!switch_partial) + scx_bpf_switch_all(); + + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple", +}; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c new file mode 100644 index 0000000000000..5c5589770a2fc --- /dev/null +++ b/tools/sched_ext/scx_simple.c @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include "scx_simple.bpf.skel.h" + +const char help_fmt[] = +"A simple sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-f] [-p]\n" +"\n" +" -f Use FIFO scheduling instead of weighted vtime scheduling\n" +" -p Switch only tasks on SCHED_EXT policy intead of all\n" +" -h Display this help and exit\n"; + +static volatile int exit_req; + +static void sigint_handler(int simple) +{ + exit_req = 1; +} + +static void read_stats(struct scx_simple *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_simple *skel; + struct bpf_link *link; + __u32 opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + skel = scx_simple__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + while ((opt = getopt(argc, argv, "fph")) != -1) { + switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'p': + skel->rodata->switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_BUG_ON(scx_simple__load(skel), "Failed to load skel"); + + link = bpf_map__attach_struct_ops(skel->maps.simple_ops); + SCX_BUG_ON(!link, "Failed to attach struct_ops"); + + while (!exit_req && !uei_exited(&skel->bss->uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu global=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + 
uei_print(&skel->bss->uei); + scx_simple__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c new file mode 100644 index 0000000000000..4cdc3a6fb880a --- /dev/null +++ b/tools/sched_ext/scx_userland.bpf.c @@ -0,0 +1,349 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal userland scheduler. + * + * In terms of scheduling, this provides two different types of behaviors: + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. + * All such tasks are direct-dispatched from the kernel, and are never + * enqueued in user space. + * 2. A primitive vruntime scheduler that is implemented in user space, for all + * other tasks. + * + * Some parts of this example user space scheduler could be implemented more + * efficiently using more complex and sophisticated data structures. For + * example, rather than using BPF_MAP_TYPE_QUEUE's, + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between + * user space and kernel space. Similarly, we use a simple vruntime-sorted list + * in user space, but an rbtree could be used instead. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include "scx_userland.h" + +/* + * Maximum amount of tasks enqueued/dispatched between kernel and user-space. + */ +#define MAX_ENQUEUED_TASKS 4096 + +char _license[] SEC("license") = "GPL"; + +const volatile bool switch_partial; +const volatile s32 usersched_pid; + +/* !0 for veristat, set during init */ +const volatile u32 num_possible_cpus = 64; + +/* Stats that are printed by user space. */ +u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; + +/* + * Number of tasks that are queued for scheduling. 
+ * + * This number is incremented by the BPF component when a task is queued to the + * user-space scheduler and it must be decremented by the user-space scheduler + * when a task is consumed. + */ +volatile u64 nr_queued; + +/* + * Number of tasks that are waiting for scheduling. + * + * This number must be updated by the user-space scheduler to keep track if + * there is still some scheduling work to do. + */ +volatile u64 nr_scheduled; + +struct user_exit_info uei; + +/* + * The map containing tasks that are enqueued in user space from the kernel. + * + * This map is drained by the user space scheduler. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, struct scx_userland_enqueued_task); +} enqueued SEC(".maps"); + +/* + * The map containing tasks that are dispatched to the kernel from user space. + * + * Drained by the kernel in userland_dispatch(). + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, MAX_ENQUEUED_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +/* + * Set user-space scheduler wake-up flag (equivalent to an atomic release + * operation). + */ +static void set_usersched_needed(void) +{ + __sync_fetch_and_or(&usersched_needed, 1); +} + +/* + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic + * acquire operation). 
+ */ +static bool test_and_clear_usersched_needed(void) +{ + return __sync_fetch_and_and(&usersched_needed, 0) == 1; +} + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = bpf_task_from_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. + */ + if (!p) + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + + return p; +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + + p = usersched_task(); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + struct scx_userland_enqueued_task task = {}; + + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. 
+ */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + /* + * The task now sits in the enqueued map: account it in + * nr_queued, which userland_update_idle() checks to decide + * whether the user-space scheduler still has work pending. + */ + __sync_fetch_and_add(&nr_queued, 1); + __sync_fetch_and_add(&nr_user_enqueues, 1); + set_usersched_needed(); + } +} + +/* + * Enqueue @p. When keep_in_kernel(p) is true the task is dispatched right + * here: to SCX_DSQ_LOCAL if tctx->force_local is set (the flag is cleared + * again), to SCX_DSQ_GLOBAL otherwise. Any other task, except the user-space + * scheduler task itself, is handed to enqueue_task_in_user_space(). + */ +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} + +/* + * Dispatch the user-space scheduler task if its wake-up flag was set, then + * pop pids from the dispatched map (the loop is bounded by + * bpf_repeat(MAX_ENQUEUED_TASKS)) and dispatch every task that can still be + * found to SCX_DSQ_GLOBAL. + */ +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + if (test_and_clear_usersched_needed()) + dispatch_user_scheduler(); + + bpf_repeat(MAX_ENQUEUED_TASKS) { + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + break; + + /* + * The task could have exited by the time we get around to + * dispatching it. Treat this as a normal occurrence, and simply + * move onto the next iteration. + */ + p = bpf_task_from_pid(pid); + if (!p) + continue; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +/* + * A CPU is about to change its idle state. If the CPU is going idle, ensure + * that the user-space scheduler has a chance to run if there is any remaining + * work to do. + */ +void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) +{ + /* + * Don't do anything if we exit from an idle state, a CPU owner will + * be assigned in .running(). 
+ */ + if (!idle) + return; + /* + * A CPU is now available, notify the user-space scheduler that tasks + * can be dispatched, if there is at least one task waiting to be + * scheduled, either queued (accounted in nr_queued) or scheduled + * (accounted in nr_scheduled). + * + * NOTE: nr_queued is incremented by the BPF component, more exactly in + * enqueue(), when a task is sent to the user-space scheduler, then + * the scheduler drains the queued tasks (updating nr_queued) and adds + * them to its internal data structures / state; at this point tasks + * become "scheduled" and the user-space scheduler will take care of + * updating nr_scheduled accordingly; lastly tasks will be dispatched + * and the user-space scheduler will update nr_scheduled again. + * + * Checking both counters allows to determine if there is still some + * pending work to do for the scheduler: new tasks have been queued + * since last check, or there are still tasks "queued" or "scheduled" + * since the previous user-space scheduler run. If the counters are + * both zero it is pointless to wake-up the scheduler (even if a CPU + * becomes idle), because there is nothing to do. + * + * Keep in mind that update_idle() doesn't run concurrently with the + * user-space scheduler (that is single-threaded): this function is + * naturally serialized with the user-space scheduler code, therefore + * this check here is also safe from a concurrency perspective. + */ + if (nr_queued || nr_scheduled) { + /* + * Kick the CPU to make it immediately ready to accept + * dispatched tasks. 
+ */ + set_usersched_needed(); + scx_bpf_kick_cpu(cpu, 0); + } +} + +s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops userland_ops = { + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .update_idle = (void *)userland_update_idle, + .init_task = (void *)userland_init_task, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, + .timeout_ms = 3000, + .name = "userland", +}; diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c new file mode 100644 index 0000000000000..368acd0b38bd9 --- /dev/null +++ b/tools/sched_ext/scx_userland.c @@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. + * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. 
+ * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "scx_userland.h" +#include "scx_userland.bpf.skel.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" +"\n" +"Usage: %s [-b BATCH] [-p]\n" +"\n" +" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" +" -p Don't switch all, switch only tasks on SCHED_EXT policy\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. */ +static __u32 batch_size = 8; + +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_userland *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; + +/* Number of tasks currently enqueued. */ +static __u64 nr_curr_enqueued; + +/* The data structure containing tasks that are enqueued in user space. */ +struct enqueued_task { + LIST_ENTRY(enqueued_task) entries; + __u64 sum_exec_runtime; + double vruntime; +}; + +/* + * Use a vruntime-sorted list to store tasks. This could easily be extended to + * a more optimal data structure, such as an rbtree as is done in CFS. We + * currently elect to use a sorted list to simplify the example for + * illustrative purposes. + */ +LIST_HEAD(listhead, enqueued_task); + +/* + * A vruntime-sorted list of tasks. The head of the list contains the task with + * the lowest vruntime. That is, the task that has the "highest" claim to be + * scheduled. 
+ */ +static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); + +/* + * The main array of tasks. The array is allocated all at once during + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to + * dynamically allocate memory on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. + */ +struct enqueued_task *tasks; +static int pid_max; + +static double min_vruntime; + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +static int get_pid_max(void) +{ + FILE *fp; + int pid_max; + + fp = fopen("/proc/sys/kernel/pid_max", "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); + return -1; + } + if (fscanf(fp, "%d", &pid_max) != 1) { + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); + fclose(fp); + return -1; + } + fclose(fp); + + return pid_max; +} + +static int init_tasks(void) +{ + pid_max = get_pid_max(); + if (pid_max < 0) + return pid_max; + + tasks = calloc(pid_max, sizeof(*tasks)); + if (!tasks) { + fprintf(stderr, "Error allocating tasks array\n"); + return -ENOMEM; + } + + return 0; +} + +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(__s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + nr_vruntime_failed++; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= pid_max) + return NULL; + + return &tasks[pid]; +} + +static double calc_vruntime_delta(__u64 weight, __u64 delta) +{ + double weight_f = (double)weight / 100.0; + double delta_f = (double)delta; + + return delta_f / weight_f; +} + +static void 
update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) +{ + __u64 delta; + + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; + + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); + if (min_vruntime > enqueued->vruntime) + enqueued->vruntime = min_vruntime; + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; +} + +static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr, *enqueued, *prev; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) + return ENOENT; + + update_enqueued(curr, bpf_task); + nr_vruntime_enqueues++; + nr_curr_enqueued++; + + /* + * Enqueue the task in a vruntime-sorted list. A more optimal data + * structure such as an rbtree could easily be used as well. We elect + * to use a list here simply because it's less code, and thus the + * example is less convoluted and better serves to illustrate what a + * user space scheduler could look like. 
+ */ + + if (LIST_EMPTY(&vruntime_head)) { + LIST_INSERT_HEAD(&vruntime_head, curr, entries); + return 0; + } + + LIST_FOREACH(enqueued, &vruntime_head, entries) { + if (curr->vruntime <= enqueued->vruntime) { + LIST_INSERT_BEFORE(enqueued, curr, entries); + return 0; + } + prev = enqueued; + } + + LIST_INSERT_AFTER(prev, curr, entries); + + return 0; +} + +static void drain_enqueued_map(void) +{ + while (1) { + struct scx_userland_enqueued_task task; + int err; + + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { + skel->bss->nr_queued = 0; + skel->bss->nr_scheduled = nr_curr_enqueued; + return; + } + + err = vruntime_enqueue(&task); + if (err) { + fprintf(stderr, "Failed to enqueue task %d: %s\n", + task.pid, strerror(err)); + exit_req = 1; + return; + } + } +} + +static void dispatch_batch(void) +{ + __u32 i; + + for (i = 0; i < batch_size; i++) { + struct enqueued_task *task; + int err; + __s32 pid; + + task = LIST_FIRST(&vruntime_head); + if (!task) + break; + + min_vruntime = task->vruntime; + pid = task_pid(task); + LIST_REMOVE(task, entries); + err = dispatch_task(pid); + if (err) { + /* + * If we fail to dispatch, put the task back to the + * vruntime_head list and stop dispatching additional + * tasks in this batch. 
+ */ + LIST_INSERT_HEAD(&vruntime_head, task, entries); + break; + } + nr_curr_enqueued--; + } + skel->bss->nr_scheduled = nr_curr_enqueued; +} + +static void *run_stats_printer(void *arg) +{ + while (!exit_req) { + __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; + + nr_failed_enqueues = skel->bss->nr_failed_enqueues; + nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; + nr_user_enqueues = skel->bss->nr_user_enqueues; + total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; + + printf("o-----------------------o\n"); + printf("| BPF ENQUEUES |\n"); + printf("|-----------------------|\n"); + printf("| kern: %10llu |\n", nr_kernel_enqueues); + printf("| user: %10llu |\n", nr_user_enqueues); + printf("| failed: %10llu |\n", nr_failed_enqueues); + printf("| -------------------- |\n"); + printf("| total: %10llu |\n", total); + printf("| |\n"); + printf("|-----------------------|\n"); + printf("| VRUNTIME / USER |\n"); + printf("|-----------------------|\n"); + printf("| enq: %10llu |\n", nr_vruntime_enqueues); + printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("| failed: %10llu |\n", nr_vruntime_failed); + printf("o-----------------------o\n"); + printf("\n\n"); + fflush(stdout); + sleep(1); + } + + return NULL; +} + +static int spawn_stats_thread(void) +{ + pthread_t stats_printer; + + return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); +} + +static void bootstrap(int argc, char **argv) +{ + int err; + __u32 opt; + struct sched_param sched_param = { + .sched_priority = sched_get_priority_max(SCHED_EXT), + }; + bool switch_partial = false; + + err = init_tasks(); + if (err) + exit(err); + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + /* + * Enforce that the user scheduler task is managed by sched_ext. The + * task eagerly drains the list of enqueued tasks in its main work + * loop, and then yields the CPU. 
The BPF scheduler only schedules the + * user space scheduler task when at least one other task in the system + * needs to be scheduled. + */ + err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); + SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); + + while ((opt = getopt(argc, argv, "b:ph")) != -1) { + switch (opt) { + case 'b': + batch_size = strtoul(optarg, NULL, 0); + break; + case 'p': + switch_partial = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + exit(opt != 'h'); + } + } + + /* + * It's not always safe to allocate in a user space scheduler, as an + * enqueued task could hold a lock that we require in order to be able + * to allocate. + */ + err = mlockall(MCL_CURRENT | MCL_FUTURE); + SCX_BUG_ON(err, "Failed to prefault and lock address space"); + + skel = scx_userland__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->num_possible_cpus > 0); + skel->rodata->usersched_pid = getpid(); + assert(skel->rodata->usersched_pid > 0); + skel->rodata->switch_partial = switch_partial; + + SCX_BUG_ON(scx_userland__load(skel), "Failed to load skel"); + + enqueued_fd = bpf_map__fd(skel->maps.enqueued); + dispatched_fd = bpf_map__fd(skel->maps.dispatched); + assert(enqueued_fd > 0); + assert(dispatched_fd > 0); + + SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); + + ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops); + SCX_BUG_ON(!ops_link, "Failed to attach struct_ops"); +} + +static void sched_main_loop(void) +{ + while (!exit_req) { + /* + * Perform the following work in the main user space scheduler + * loop: + * + * 1. Drain all tasks from the enqueued map, and enqueue them + * to the vruntime sorted list. + * + * 2. Dispatch a batch of tasks from the vruntime sorted list + * down to the kernel. + * + * 3. Yield the CPU back to the system. 
The BPF scheduler will + * reschedule the user space scheduler once another task has + * been enqueued to user space. + */ + drain_enqueued_map(); + dispatch_batch(); + sched_yield(); + } +} + +int main(int argc, char **argv) +{ + bootstrap(argc, argv); + sched_main_loop(); + + exit_req = 1; + bpf_link__destroy(ops_link); + uei_print(&skel->bss->uei); + scx_userland__destroy(skel); + return 0; +} diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h new file mode 100644 index 0000000000000..684fb2dd5de96 --- /dev/null +++ b/tools/sched_ext/scx_userland.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta, Inc */ + +#ifndef __SCX_USERLAND_COMMON_H +#define __SCX_USERLAND_COMMON_H + +/* + * An instance of a task that has been enqueued by the kernel for consumption + * by a user space global scheduler thread. + */ +struct scx_userland_enqueued_task { + __s32 pid; + u64 sum_exec_runtime; + u64 weight; +}; + +#endif // __SCX_USERLAND_COMMON_H diff --git a/tools/testing/selftests/scx/.gitignore b/tools/testing/selftests/scx/.gitignore new file mode 100644 index 0000000000000..2c077082b67a5 --- /dev/null +++ b/tools/testing/selftests/scx/.gitignore @@ -0,0 +1,14 @@ +ddsp_bogus_dsq_fail +ddsp_vtimelocal_fail +enq_last_no_enq_fails +enq_select_cpu_fails +init_enable_count +minimal +runner +select_cpu_dfl +select_cpu_dfl_nodispatch +select_cpu_dispatch +select_cpu_dispatch_dbl_dsp +select_cpu_dispatch_bad_dsq +select_cpu_vtime +build/ diff --git a/tools/testing/selftests/scx/Makefile b/tools/testing/selftests/scx/Makefile new file mode 100644 index 0000000000000..e7ec3397bcc5b --- /dev/null +++ b/tools/testing/selftests/scx/Makefile @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+include ../../../build/Build.include +include ../../../scripts/Makefile.arch +include ../../../scripts/Makefile.include +include ../lib.mk + +ifneq ($(LLVM),) +ifneq ($(filter %/,$(LLVM)),) +LLVM_PREFIX := $(LLVM) +else ifneq ($(filter -%,$(LLVM)),) +LLVM_SUFFIX := $(LLVM) +endif + +CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as +else +CC := gcc +endif # LLVM + +ifneq ($(CROSS_COMPILE),) +$(error CROSS_COMPILE not supported for scx selftests) +endif # CROSS_COMPILE + +CURDIR := $(abspath .) +REPOROOT := $(abspath ../../../..) +TOOLSDIR := $(REPOROOT)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +TOOLSINCDIR := $(TOOLSDIR)/include +BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool +APIDIR := $(TOOLSINCDIR)/uapi +GENDIR := $(REPOROOT)/include/generated +GENHDR := $(GENDIR)/autoconf.h +SCXTOOLSDIR := $(TOOLSDIR)/sched_ext +SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include + +OUTPUT_DIR := $(CURDIR)/build +OBJ_DIR := $(OUTPUT_DIR)/obj +INCLUDE_DIR := $(OUTPUT_DIR)/include +BPFOBJ_DIR := $(OBJ_DIR)/libbpf +SCXOBJ_DIR := $(OBJ_DIR)/sched_ext +BPFOBJ := $(BPFOBJ_DIR)/libbpf.a +LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a +DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool +HOST_BUILD_DIR := $(OBJ_DIR) +HOST_OUTPUT_DIR := $(OUTPUT_DIR) + +VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif + +BPFTOOL ?= $(DEFAULT_BPFTOOL) + +ifneq ($(wildcard $(GENHDR)),) + GENFLAGS := -DHAVE_GENHDR +endif + +CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ + -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ + -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) + +# Silence some warnings when compiled with clang +ifneq ($(LLVM),) +CFLAGS += -Wno-unused-command-line-argument +endif + +LDFLAGS = -lelf -lz -lpthread 
# Generate the BPF skeleton and subskeleton headers for one test scheduler.
#
# The compiled BPF object is passed through "bpftool gen object" three times
# (linked1 -> linked2 -> linked3). The second and third outputs are compared
# with diff, so the rule fails if linking an already-linked object changes it.
# The skeleton is written to $@ and the subskeleton next to it as
# *.bpf.subskel.h; both are generated from the final linked object and are
# named after the header's basename with the ".bpf.skel.h" suffix removed.
$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR)
	$(eval sched=$(notdir $@))
	$(call msg,GEN-SKEL,,$(sched))
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
	$(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
	$(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o)
	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@
	$(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h)
.SECONDEXPANSION:
# Compile each testcase into its own object file.
#
# The skeleton headers are prerequisites only when a matching <test>.bpf.c or
# <test>_fail.bpf.c source exists, which is why the prerequisite list relies on
# secondary expansion.
#
# runner.o remains a prerequisite, but it is not passed to the compiler: with
# -c nothing is linked, so an object file on the command line is only ignored
# (with a warning). The object is linked in by the "runner" rule instead.
$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o \
		      $$(if $$(wildcard $$*.bpf.c), $(INCLUDE_DIR)/%.bpf.skel.h) \
		      $$(if $$(wildcard $$*_fail.bpf.c), $(INCLUDE_DIR)/%_fail.bpf.skel.h) \
		      | $(SCXOBJ_DIR)
	$(CC) $(CFLAGS) -c $< -o $@
+ */ + scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { + .select_cpu = ddsp_bogus_dsq_fail_select_cpu, + .exit = ddsp_bogus_dsq_fail_exit, + .init = ddsp_bogus_dsq_fail_init, + .name = "ddsp_bogus_dsq_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c new file mode 100644 index 0000000000000..ef8ee04ff9871 --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_bogus_dsq_fail.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "ddsp_bogus_dsq_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_bogus_dsq_fail *skel; + + skel = ddsp_bogus_dsq_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_bogus_dsq_fail *skel = ctx; + + ddsp_bogus_dsq_fail__destroy(skel); +} + +struct scx_test ddsp_bogus_dsq_fail = { + .name = "ddsp_bogus_dsq_fail", + .description = "Verify we gracefully fail, and fall back to using a " + 
"built-in DSQ, if we do a direct dispatch to an invalid" + " DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c new file mode 100644 index 0000000000000..9b21c1d57861c --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.bpf.c @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include + +char _license[] SEC("license") = "GPL"; + +struct user_exit_info uei; + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + + if (cpu >= 0) { + /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. */ + scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + p->scx.dsq_vtime, 0); + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops ddsp_vtimelocal_fail_ops = { + .select_cpu = ddsp_vtimelocal_fail_select_cpu, + .init = ddsp_vtimelocal_fail_init, + .exit = ddsp_vtimelocal_fail_exit, + .name = "ddsp_vtimelocal_fail", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c new file mode 100644 index 0000000000000..b55611cd0b1fb --- /dev/null +++ b/tools/testing/selftests/scx/ddsp_vtimelocal_fail.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include "ddsp_vtimelocal_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct ddsp_vtimelocal_fail *skel; + + skel = ddsp_vtimelocal_fail__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct ddsp_vtimelocal_fail *skel = ctx; + + ddsp_vtimelocal_fail__destroy(skel); +} + +struct scx_test ddsp_vtimelocal_fail = { + .name = "ddsp_vtimelocal_fail", + .description = "Verify we gracefully fail, and fall back to using a " + "built-in DSQ, if we do a direct vtime dispatch to a " + "built-in DSQ from DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) diff --git a/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c new file mode 100644 index 0000000000000..4b0f84568dc15 --- /dev/null +++ b/tools/testing/selftests/scx/enq_last_no_enq_fails.bpf.c @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +s32 BPF_STRUCT_OPS(enq_last_no_enq_fails_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_last_no_enq_fails_ops = { + .init = enq_last_no_enq_fails_init, + .name = "enq_last_no_enq_fails", + /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ + .flags = SCX_OPS_ENQ_LAST, + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/enq_last_no_enq_fails.c b/tools/testing/selftests/scx/enq_last_no_enq_fails.c new file mode 100644 index 0000000000000..2a3eda5e2c0b4 --- /dev/null +++ b/tools/testing/selftests/scx/enq_last_no_enq_fails.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_last_no_enq_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_last_no_enq_fails *skel; + + skel = enq_last_no_enq_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); + if (link) { + SCX_ERR("Incorrectly succeeded in to attaching scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_last_no_enq_fails *skel = ctx; + + enq_last_no_enq_fails__destroy(skel); +} + +struct scx_test enq_last_no_enq_fails = { + .name = "enq_last_no_enq_fails", + .description = "Verify we fail to load a scheduler if we specify " + "the SCX_OPS_ENQ_LAST flag without defining " + 
/*
 * ops.select_cpu() for the enq_select_cpu_fails test.
 *
 * Makes no placement decision of its own: prev_cpu is returned unchanged and
 * wake_flags is ignored.
 */
s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	return prev_cpu;
}
+ */ + bool found = false; + + /* Can only call from ops.select_cpu() */ + scx_bpf_select_cpu_dfl(p, 0, 0, &found); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +s32 BPF_STRUCT_OPS(enq_select_cpu_fails_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_select_cpu_fails_ops = { + .select_cpu = enq_select_cpu_fails_select_cpu, + .enqueue = enq_select_cpu_fails_enqueue, + .init = enq_select_cpu_fails_init, + .name = "enq_select_cpu_fails", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/enq_select_cpu_fails.c b/tools/testing/selftests/scx/enq_select_cpu_fails.c new file mode 100644 index 0000000000000..dd1350e5f002d --- /dev/null +++ b/tools/testing/selftests/scx/enq_select_cpu_fails.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "enq_select_cpu_fails.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu_fails *skel; + + skel = enq_select_cpu_fails__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + sleep(1); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu_fails *skel = ctx; + + enq_select_cpu_fails__destroy(skel); +} + +struct scx_test enq_select_cpu_fails = { + .name = "enq_select_cpu_fails", + .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " + "from 
ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/scx/init_enable_count.bpf.c b/tools/testing/selftests/scx/init_enable_count.bpf.c new file mode 100644 index 0000000000000..8ad8fdf4ad608 --- /dev/null +++ b/tools/testing/selftests/scx/init_enable_count.bpf.c @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that verifies that we do proper counting of init, enable, etc + * callbacks. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; +volatile const bool switch_all; + +s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + __sync_fetch_and_add(&init_task_cnt, 1); + + return 0; +} + +void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) +{ + __sync_fetch_and_add(&exit_task_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) +{ + __sync_fetch_and_add(&enable_cnt, 1); +} + +void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) +{ + __sync_fetch_and_add(&disable_cnt, 1); +} + +s32 BPF_STRUCT_OPS(cnt_init) +{ + if (switch_all) + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops init_enable_count_ops = { + .init_task = cnt_init_task, + .exit_task = cnt_exit_task, + .enable = cnt_enable, + .disable = cnt_disable, + .init = cnt_init, + .name = "init_enable_count", +}; diff --git a/tools/testing/selftests/scx/init_enable_count.c b/tools/testing/selftests/scx/init_enable_count.c new file mode 100644 index 0000000000000..671e3366e67d2 --- /dev/null +++ b/tools/testing/selftests/scx/init_enable_count.c @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include +#include +#include "scx_test.h" +#include "init_enable_count.bpf.skel.h" + +#define SCHED_EXT 7 + +static struct init_enable_count * +open_load_prog(bool global) +{ + struct init_enable_count *skel; + + skel = init_enable_count__open(); + SCX_BUG_ON(!skel, "Failed to open skel"); + + if (global) + skel->rodata->switch_all = global; + + SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); + + return skel; +} + +static enum scx_test_status run_test(bool global) +{ + struct init_enable_count *skel; + struct bpf_link *link; + const u32 num_children = 5; + int ret, i, status; + struct sched_param param = {}; + pid_t pids[num_children]; + + skel = open_load_prog(global); + link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops"); + + /* SCHED_EXT children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); + + if (pids[i] == 0) { + ret = sched_setscheduler(0, SCHED_EXT, ¶m); + SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); + + /* + * Reset to SCHED_OTHER for half of them. Counts for + * everything should still be the same regardless, as + * ops.disable() is invoked even if a task is still on + * SCHED_EXT before it exits. 
+ */ + if (i % 2 == 0) { + ret = sched_setscheduler(0, SCHED_OTHER, ¶m); + SCX_BUG_ON(ret, "Failed to reset sched to normal"); + } + exit(0); + } + } + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for SCX child\n"); + + SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, + status); + } + + /* SCHED_OTHER children */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + if (pids[i] == 0) + exit(0); + } + + for (i = 0; i < num_children; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for normal child\n"); + + SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, + status); + } + + sleep(1); + + SCX_GE(skel->bss->init_task_cnt, 2 * num_children); + SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); + + if (global) { + SCX_GE(skel->bss->enable_cnt, 2 * num_children); + SCX_GE(skel->bss->disable_cnt, 2 * num_children); + } else { + SCX_EQ(skel->bss->enable_cnt, num_children); + SCX_EQ(skel->bss->disable_cnt, num_children); + } + + bpf_link__destroy(link); + init_enable_count__destroy(skel); + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + enum scx_test_status status; + + status = run_test(true); + if (status != SCX_TEST_PASS) + return status; + + return run_test(false); +} + +struct scx_test init_enable_count = { + .name = "init_enable_count", + .description = "Verify we do the correct amount of counting of init, " + "enable, etc callbacks.", + .run = run, +}; +REGISTER_SCX_TEST(&init_enable_count) diff --git a/tools/testing/selftests/scx/maybe_null.bpf.c b/tools/testing/selftests/scx/maybe_null.bpf.c new file mode 100644 index 0000000000000..1e9b1fdedc88a --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null.bpf.c @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ */ + +#include + +char _license[] SEC("license") = "GPL"; + +u64 vtime_test; + +void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) +{} + +void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) +{ + if (p != NULL) + vtime_test = p->scx.dsq_vtime; +} + +SEC(".struct_ops.link") +struct sched_ext_ops maybe_null_success = { + .dispatch = maybe_null_success_dispatch, + .enable = maybe_null_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/maybe_null.c b/tools/testing/selftests/scx/maybe_null.c new file mode 100644 index 0000000000000..4f093a5ee4de8 --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null.c @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + */ +#include +#include +#include +#include +#include "maybe_null.bpf.skel.h" +#include "maybe_null_fail.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status run(void *ctx) +{ + struct maybe_null *skel; + struct maybe_null_fail *fail_skel; + + skel = maybe_null__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load maybe_null skel"); + return SCX_TEST_FAIL; + } + maybe_null__destroy(skel); + + fail_skel = maybe_null_fail__open_and_load(); + if (fail_skel) { + maybe_null_fail__destroy(fail_skel); + SCX_ERR("Should failed to open and load maybe_null_fail skel"); + return SCX_TEST_FAIL; + } + + return SCX_TEST_PASS; +} + +struct scx_test maybe_null = { + .name = "maybe_null", + .description = "Verify if PTR_MAYBE_NULL work for .dispatch", + .run = run, +}; +REGISTER_SCX_TEST(&maybe_null) diff --git a/tools/testing/selftests/scx/maybe_null_fail.bpf.c b/tools/testing/selftests/scx/maybe_null_fail.bpf.c new file mode 100644 index 0000000000000..bc99c13ce5839 --- /dev/null +++ b/tools/testing/selftests/scx/maybe_null_fail.bpf.c @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
/*
 * ops.dispatch() that copies p->scx.dsq_vtime into vtime_test without first
 * comparing p against NULL.
 *
 * NOTE(review): the missing NULL check looks deliberate -- the maybe_null
 * test passes only if loading this object fails. Presumably the verifier
 * rejects the unchecked dereference; confirm before "fixing" it.
 */
void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p)
{
	vtime_test = p->scx.dsq_vtime;
}
+ */ + .enable = minimal_running, + .name = "minimal", +}; diff --git a/tools/testing/selftests/scx/minimal.c b/tools/testing/selftests/scx/minimal.c new file mode 100644 index 0000000000000..6c5db8ebbf8ac --- /dev/null +++ b/tools/testing/selftests/scx/minimal.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct minimal *skel; + + skel = minimal__open_and_load(); + if (!skel) { + SCX_ERR("Failed to open and load skel"); + return SCX_TEST_FAIL; + } + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct minimal *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct minimal *skel = ctx; + + minimal__destroy(skel); +} + +struct scx_test minimal = { + .name = "minimal", + .description = "Verify we can load a fully minimal scheduler", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&minimal) diff --git a/tools/testing/selftests/scx/runner.c b/tools/testing/selftests/scx/runner.c new file mode 100644 index 0000000000000..17262c30b96de --- /dev/null +++ b/tools/testing/selftests/scx/runner.c @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
/*
 * Help text printed by main() for -h and for unrecognized options. The single
 * %s conversion is filled with the basename of argv[0].
 *
 * The usage line lists every option accepted by the getopt() string in main()
 * ("qt:h"), and the introductory paragraph ends with its own newline so that
 * a blank line separates it from the usage line.
 */
const char help_fmt[] =
"The runner for sched_ext tests.\n"
"\n"
"The runner is statically linked against all testcases, and runs them all serially.\n"
"It's required for the testcases to be serial, as only a single host-wide sched_ext\n"
"scheduler may be loaded at any given time.\n"
"\n"
"Usage: %s [-t TEST] [-q] [-h]\n"
"\n"
" -t TEST Only run tests whose name includes this string\n"
" -q Don't print the test descriptions during run\n"
" -h Display this help and exit\n";
"SKIP " : ""; + + printf("%s %u %s # %s\n", result, testnum, test->name, directive); + printf("===== END =====\n"); +} + +static bool should_skip_test(const struct scx_test *test, const char * filter) +{ + return !strstr(test->name, filter); +} + +static enum scx_test_status run_test(const struct scx_test *test) +{ + enum scx_test_status status; + void *context = NULL; + + if (test->setup) { + status = test->setup(&context); + if (status != SCX_TEST_PASS) + return status; + } + + status = test->run(context); + + if (test->cleanup) + test->cleanup(context); + + return status; +} + +static bool test_valid(const struct scx_test *test) +{ + if (!test) { + fprintf(stderr, "NULL test detected\n"); + return false; + } + + if (!test->name) { + fprintf(stderr, + "Test with no name found. Must specify test name.\n"); + return false; + } + + if (!test->description) { + fprintf(stderr, "Test %s requires description.\n", test->name); + return false; + } + + if (!test->run) { + fprintf(stderr, "Test %s has no run() callback\n", test->name); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + const char *filter = NULL; + unsigned testnum = 0, i; + unsigned passed = 0, skipped = 0, failed = 0; + int opt; + + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); + + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + + while ((opt = getopt(argc, argv, "qt:h")) != -1) { + switch (opt) { + case 'q': + quiet = true; + break; + case 't': + filter = optarg; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + for (i = 0; i < __scx_num_tests; i++) { + enum scx_test_status status; + struct scx_test *test = &__scx_tests[i]; + + print_test_preamble(test, quiet); + + if (filter && should_skip_test(test, filter)) { + print_test_result(test, SCX_TEST_SKIP, ++testnum); + continue; + } + + status = run_test(test); + print_test_result(test, status, ++testnum); + switch (status) { + case SCX_TEST_PASS: + passed++; + 
break; + case SCX_TEST_SKIP: + skipped++; + break; + case SCX_TEST_FAIL: + failed++; + break; + } + } + printf("\n\n=============================\n\n"); + printf("RESULTS:\n\n"); + printf("PASSED: %u\n", passed); + printf("SKIPPED: %u\n", skipped); + printf("FAILED: %u\n", failed); + + return 0; +} + +void scx_test_register(struct scx_test *test) +{ + SCX_BUG_ON(!test_valid(test), "Invalid test found"); + SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); + + __scx_tests[__scx_num_tests++] = *test; +} diff --git a/tools/testing/selftests/scx/scx_test.h b/tools/testing/selftests/scx/scx_test.h new file mode 100644 index 0000000000000..4b70bf75fa814 --- /dev/null +++ b/tools/testing/selftests/scx/scx_test.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 Tejun Heo + * Copyright (c) 2023 David Vernet + */ + +#ifndef __SCX_TEST_H__ +#define __SCX_TEST_H__ + +#include +#include + +enum scx_test_status { + SCX_TEST_PASS = 0, + SCX_TEST_SKIP, + SCX_TEST_FAIL, +}; + +/* Copied from include/linux/sched/ext.h */ +enum scx_test_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* BPF unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +struct scx_test { + /** + * name - The name of the testcase. + */ + const char *name; + + /** + * description - A description of your testcase: what it tests and is + * meant to validate. + */ + const char *description; + + /* + * setup - Setup the test. + * @ctx: A pointer to a context object that will be passed to run and + * cleanup. + * + * An optional callback that allows a testcase to perform setup for its + * run. A test may return SCX_TEST_SKIP to skip the run. 
/*
 * Two-level token paste. The extra level of indirection makes the
 * preprocessor expand the arguments (e.g. __LINE__) before they are pasted;
 * a direct "prefix##__LINE__" pastes the literal token __LINE__ instead.
 */
#define SCX_TEST_CONCAT_(__a, __b) __a##__b
#define SCX_TEST_CONCAT(__a, __b) SCX_TEST_CONCAT_(__a, __b)

/*
 * REGISTER_SCX_TEST - Register a testcase with the runner at startup.
 * @__test: Pointer to the struct scx_test to register.
 *
 * Expands to a static constructor function that passes @__test to
 * scx_test_register(). The function name embeds the line number of the macro
 * invocation, so several tests can be registered from the same source file
 * without their constructors colliding.
 */
#define REGISTER_SCX_TEST(__test)					\
	__attribute__((constructor))					\
	static void SCX_TEST_CONCAT(___scxregister, __LINE__)(void)	\
	{								\
		scx_test_register(__test);				\
	}
\ + do { \ + if (__cond) \ + SCX_FAIL(__fmt, ##__VA_ARGS__); \ + } while (0) + +/* + * Comparison assertions. On failure they print both operands and return + * SCX_TEST_FAIL from the calling function. Each operand is expanded twice + * (once for the check, once for the message), so avoid side effects in them. + * Operands are printed as unsigned long long so that the format matches the + * argument type on every ABI. + */ +#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%llu > %llu)", \ + #_x, #_y, (unsigned long long)(_x), (unsigned long long)(_y)) +#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%llu >= %llu)", \ + #_x, #_y, (unsigned long long)(_x), (unsigned long long)(_y)) +#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%llu < %llu)", \ + #_x, #_y, (unsigned long long)(_x), (unsigned long long)(_y)) +#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%llu <= %llu)", \ + #_x, #_y, (unsigned long long)(_x), (unsigned long long)(_y)) +#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%llu == %llu)", \ + #_x, #_y, (unsigned long long)(_x), (unsigned long long)(_y)) +#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%llu)", \ + #_x, (unsigned long long)(_x)) + +#endif /* __SCX_TEST_H__ */ diff --git a/tools/testing/selftests/scx/select_cpu_dfl.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c new file mode 100644 index 0000000000000..f2fa80628299b --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl.bpf.c @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; +/* Set by ops.enqueue() when a test task is enqueued while its CPU is in the idle mask. */ +bool saw_local = false; +/* + * Return true when bpf_strncmp() reports a match between the comm of @p and + * the literal below. + * NOTE(review): the length passed is 9 while the literal has 10 characters, + * so presumably only a prefix is compared. Confirm that this is intended. + */ +static bool task_is_test(const struct task_struct *p) +{ + return !bpf_strncmp(p->comm, 9, "select_cpu"); +} +/* + * Record in saw_local whether a test task reached ops.enqueue() while the CPU + * reported by scx_bpf_task_cpu() was in the idle cpumask, then dispatch the + * task to the global DSQ with the default slice. + */ +void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, + u64 enq_flags) +{ + const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); + + if (task_is_test(p) && + bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { + saw_local = true; + } + scx_bpf_put_idle_cpumask(idle_mask); + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} +/* Call scx_bpf_switch_all() when the scheduler is initialized and report success. */ +s32 BPF_STRUCT_OPS(select_cpu_dfl_init) +{ + scx_bpf_switch_all(); + + return 0; +} +/* Only enqueue and init are provided; select_cpu is left unset. */ +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_ops = { + .enqueue = select_cpu_dfl_enqueue, + .init = select_cpu_dfl_init, + .name = "select_cpu_dfl", +}; diff --git a/tools/testing/selftests/scx/select_cpu_dfl.c b/tools/testing/selftests/scx/select_cpu_dfl.c new file mode 100644 index 0000000000000..a53a40c2d2f0f --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dfl.bpf.skel.h" +#include "scx_test.h" + +/* Number of child processes forked by run(). */ +#define NUM_CHILDREN 1028 + +/* Open and load the BPF skeleton and pass it on to run() and cleanup() through @ctx. */ +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl *skel; + + skel = select_cpu_dfl__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +/* + * Attach the scheduler, fork NUM_CHILDREN children that each sleep for one + * second and exit, and reap them all. The test passes when every child exits + * with status 0 and the BPF side never set saw_local. + * + * Note that the SCX_* assertion macros return from this function directly, so + * a failing assertion skips bpf_link__destroy(). + */ +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + /* Fail early: waitpid() on a pid of -1 would reap any child. */ + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child %d", i); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(!skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +/* Destroy the skeleton created in setup(). */ +static void cleanup(void *ctx) +{ + struct select_cpu_dfl *skel = ctx; + + select_cpu_dfl__destroy(skel); +} + +struct scx_test select_cpu_dfl = { + .name = "select_cpu_dfl", + .description = "Verify the default ops.select_cpu() dispatches tasks " + "when idle cores are found, and skips ops.enqueue()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl) diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c new file mode 100644 index 0000000000000..636ea1de12fe0 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.bpf.c @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag + * specified. + * + * Copyright (c) 2023 Meta Platforms, Inc. 
and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; +/* Set by ops.enqueue() once it dispatches a task to the local DSQ. */ +bool saw_local = false; + +/* Per-task scheduling context, kept in the task_ctx_stor task storage map */ +struct task_ctx { + bool force_local; /* written by scx_bpf_select_cpu_dfl() in ops.select_cpu(), consumed by ops.enqueue() */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +/* Manually specify the signature until the kfunc is added to the scx repo. */ +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + bool *found) __ksym; +/* + * Run the default CPU selection for @p and store its found result in the task + * context so that ops.enqueue() can act on it. Returns the CPU chosen by + * scx_bpf_select_cpu_dfl(), or -ESRCH if the task context is missing. + */ +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + s32 cpu; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return -ESRCH; + } + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, + &tctx->force_local); + + return cpu; +} +/* + * Dispatch @p to the local DSQ when ops.select_cpu() left force_local set, + * clearing the flag and recording the event in saw_local. All other tasks go + * to the global DSQ. + */ +void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, + u64 enq_flags) +{ + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return; + } + + if (tctx->force_local) { + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + saw_local = true; + } + + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); +} +/* Create the task_ctx storage for @p; returns -ENOMEM when that fails. */ +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, + struct task_struct *p, struct scx_init_task_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} +/* Call scx_bpf_switch_all() when the scheduler is initialized and report success. */ +s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init) +{ + scx_bpf_switch_all(); + + return 0; +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { + .select_cpu = select_cpu_dfl_nodispatch_select_cpu, + .enqueue = 
select_cpu_dfl_nodispatch_enqueue, + .init_task = select_cpu_dfl_nodispatch_init_task, + .init = select_cpu_dfl_nodispatch_init, + .name = "select_cpu_dfl_nodispatch", +}; diff --git a/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c new file mode 100644 index 0000000000000..1d85bf4bf3a39 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dfl_nodispatch.c @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dfl_nodispatch.bpf.skel.h" +#include "scx_test.h" + +/* Number of child processes forked by run(). */ +#define NUM_CHILDREN 1028 + +/* Open and load the BPF skeleton and pass it on to run() and cleanup() through @ctx. */ +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dfl_nodispatch *skel; + + skel = select_cpu_dfl_nodispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +/* + * Attach the scheduler, fork NUM_CHILDREN children that each sleep for one + * second and exit, and reap them all. The test passes when every child exits + * with status 0 and the BPF side set saw_local at least once. + * + * Note that the SCX_* assertion macros return from this function directly, so + * a failing assertion skips bpf_link__destroy(). + */ +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + /* Fail early: waitpid() on a pid of -1 would reap any child. */ + SCX_FAIL_IF(pids[i] < 0, "Failed to fork child %d", i); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + SCX_ASSERT(skel->bss->saw_local); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +/* Destroy the skeleton created in setup(). */ +static void cleanup(void *ctx) +{ + struct select_cpu_dfl_nodispatch *skel = ctx; + + select_cpu_dfl_nodispatch__destroy(skel); +} + +struct scx_test select_cpu_dfl_nodispatch = { + .name = "select_cpu_dfl_nodispatch", + .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = 
cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c new file mode 100644 index 0000000000000..0fda977697251 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch.bpf.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching to the + * built-in local and global DSQs from ops.select_cpu(). + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; +/* + * Direct dispatch @p from ops.select_cpu(). If @prev_cpu is idle, or another + * idle CPU allowed for @p can be picked, the task goes to the local DSQ and + * that CPU is returned. Otherwise it goes to the global DSQ and @prev_cpu is + * returned. + */ +s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + u64 dsq_id = SCX_DSQ_LOCAL; + s32 cpu = prev_cpu; + + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + goto dispatch; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto dispatch; + + dsq_id = SCX_DSQ_GLOBAL; + cpu = prev_cpu; + +dispatch: + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); + return cpu; +} +/* Call scx_bpf_switch_all() when the scheduler is initialized and report success. */ +s32 BPF_STRUCT_OPS(select_cpu_dispatch_init) +{ + scx_bpf_switch_all(); + + return 0; +} +/* No ops.enqueue() is provided; ops.select_cpu() dispatches every task it sees. */ +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_ops = { + .select_cpu = select_cpu_dispatch_select_cpu, + .init = select_cpu_dispatch_init, + .name = "select_cpu_dispatch", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch.c b/tools/testing/selftests/scx/select_cpu_dispatch.c new file mode 100644 index 0000000000000..0309ca8785b36 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch.c @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch.bpf.skel.h" +#include "scx_test.h" + +#define NUM_CHILDREN 1028 + +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch *skel; + + skel = select_cpu_dispatch__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + struct bpf_link *link; + pid_t pids[NUM_CHILDREN]; + int i, status; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + for (i = 0; i < NUM_CHILDREN; i++) { + pids[i] = fork(); + if (pids[i] == 0) { + sleep(1); + exit(0); + } + } + + for (i = 0; i < NUM_CHILDREN; i++) { + SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); + SCX_EQ(status, 0); + } + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch *skel = ctx; + + select_cpu_dispatch__destroy(skel); +} + +struct scx_test select_cpu_dispatch = { + .name = "select_cpu_dispatch", + .description = "Test direct dispatching to built-in DSQs from " + "ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c new file mode 100644 index 0000000000000..c9105add924d5 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.bpf.c @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates the behavior of direct dispatching with a default + * select_cpu implementation. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; +/* Exit information saved by ops.exit() through uei_record(). */ +struct user_exit_info uei; +/* Dispatch @p to a DSQ ID that this scheduler never creates and keep it on @prev_cpu. */ +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching to a random DSQ should fail. */ + scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); + + return prev_cpu; +} +/* Call scx_bpf_switch_all() when the scheduler is initialized and report success. */ +s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_init) +{ + scx_bpf_switch_all(); + + return 0; +} +/* Save the exit info @ei into uei. */ +void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { + .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, + .init = select_cpu_dispatch_bad_dsq_init, + .exit = select_cpu_dispatch_bad_dsq_exit, + .name = "select_cpu_dispatch_bad_dsq", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c new file mode 100644 index 0000000000000..a7b91d58cb318 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_bad_dsq.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" +#include "scx_test.h" +/* Open and load the BPF skeleton and pass it on to run() and cleanup() through @ctx. */ +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel; + + skel = select_cpu_dispatch_bad_dsq__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} +/* + * Attach the scheduler, wait one second, and expect the exit kind recorded in + * uei to be SCX_EXIT_ERROR. + * NOTE(review): a failing SCX_EQ() returns before bpf_link__destroy() is + * reached. + */ +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} +/* Destroy the skeleton created in setup(). */ +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_bad_dsq *skel = ctx; + + select_cpu_dispatch_bad_dsq__destroy(skel); +} + +struct scx_test select_cpu_dispatch_bad_dsq = { + .name = "select_cpu_dispatch_bad_dsq", + .description = "Verify graceful failure if we direct-dispatch to a " + "bogus DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c new file mode 100644 index 0000000000000..82d8148399f28 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.bpf.c @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that dispatching the same task twice from + * ops.select_cpu() is reported as an error. + * + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; +/* Exit information saved by ops.exit() through uei_record(). */ +struct user_exit_info uei; +/* Dispatch @p to the global DSQ twice in the same call and keep it on @prev_cpu. */ +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Dispatching twice in a row is disallowed. */ + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + return prev_cpu; +} +/* Call scx_bpf_switch_all() when the scheduler is initialized and report success. */ +s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_init) +{ + scx_bpf_switch_all(); + + return 0; +} +/* Save the exit info @ei into uei. */ +void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { + .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, + .init = select_cpu_dispatch_dbl_dsp_init, + .exit = select_cpu_dispatch_dbl_dsp_exit, + .name = "select_cpu_dispatch_dbl_dsp", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c new file mode 100644 index 0000000000000..e32b229637448 --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_dispatch_dbl_dsp.c @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet + * Copyright (c) 2023 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" +#include "scx_test.h" +/* Open and load the BPF skeleton and pass it on to run() and cleanup() through @ctx. */ +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel; + + skel = select_cpu_dispatch_dbl_dsp__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} +/* + * Attach the scheduler, wait one second, and expect the exit kind recorded in + * uei to be SCX_EXIT_ERROR. + * NOTE(review): a failing SCX_EQ() returns before bpf_link__destroy() is + * reached. + */ +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_EQ(skel->bss->uei.kind, SCX_EXIT_ERROR); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} +/* Destroy the skeleton created in setup(). */ +static void cleanup(void *ctx) +{ + struct select_cpu_dispatch_dbl_dsp *skel = ctx; + + select_cpu_dispatch_dbl_dsp__destroy(skel); +} + +struct scx_test select_cpu_dispatch_dbl_dsp = { + .name = "select_cpu_dispatch_dbl_dsp", + .description = "Verify graceful failure if we dispatch twice to a " + "DSQ in ops.select_cpu()", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) diff --git a/tools/testing/selftests/scx/select_cpu_vtime.bpf.c b/tools/testing/selftests/scx/select_cpu_vtime.bpf.c new file mode 100644 index 0000000000000..b8bdadf3e541b --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_vtime.bpf.c @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A scheduler that validates that enqueue flags are properly stored and + * applied at dispatch time when a task is directly dispatched from + * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and + * making the test a very basic vtime scheduler. + * + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ + +#include + +char _license[] SEC("license") = "GPL"; +/* Set by ops.dispatch() when scx_bpf_consume() on VTIME_DSQ succeeds. */ +volatile bool consumed; +/* Latest dsq_vtime seen in ops.running(); serves as the current vtime. */ +static u64 vtime_now; +/* ID of the DSQ created in ops.init() and used for all vtime dispatches. */ +#define VTIME_DSQ 0 +/* Return true if @a is before @b; the signed difference copes with u64 wraparound. */ +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} +/* + * Return the dsq_vtime of @p, clamped so that it lags vtime_now by at most + * one default slice. + */ +static inline u64 task_vtime(const struct task_struct *p) +{ + u64 vtime = p->scx.dsq_vtime; + + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + return vtime_now - SCX_SLICE_DFL; + else + return vtime; +} +/* + * Pick an idle CPU allowed for @p, falling back to @prev_cpu, and direct + * dispatch the task to VTIME_DSQ ordered by task_vtime(). Returns the CPU + * that was chosen. + */ +s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + s32 cpu; + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto ddsp; + + cpu = prev_cpu; + scx_bpf_test_and_clear_cpu_idle(cpu); +ddsp: + scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); + return cpu; +} +/* Try to consume a task from VTIME_DSQ and record a success in consumed. */ +void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) +{ + if (scx_bpf_consume(VTIME_DSQ)) + consumed = true; +} +/* Advance vtime_now when the task that starts running has a later dsq_vtime. */ +void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) +{ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} +/* Charge the consumed part of the slice to @p, scaled by 100 over its weight. */ +void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, + bool runnable) +{ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} +/* Start @p at the current value of vtime_now. */ +void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} +/* Call scx_bpf_switch_all() and create VTIME_DSQ; returns the result of scx_bpf_create_dsq(). */ +s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) +{ + scx_bpf_switch_all(); + + return scx_bpf_create_dsq(VTIME_DSQ, -1); +} + +SEC(".struct_ops.link") +struct sched_ext_ops select_cpu_vtime_ops = { + .select_cpu = select_cpu_vtime_select_cpu, + .dispatch = select_cpu_vtime_dispatch, + .running = select_cpu_vtime_running, + .stopping = select_cpu_vtime_stopping, + .enable = select_cpu_vtime_enable, + .init = select_cpu_vtime_init, + .name = "select_cpu_vtime", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/scx/select_cpu_vtime.c 
b/tools/testing/selftests/scx/select_cpu_vtime.c new file mode 100644 index 0000000000000..b4629c2364f5d --- /dev/null +++ b/tools/testing/selftests/scx/select_cpu_vtime.c @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 David Vernet + * Copyright (c) 2024 Tejun Heo + */ +#include +#include +#include +#include +#include "select_cpu_vtime.bpf.skel.h" +#include "scx_test.h" +/* Open and load the BPF skeleton and pass it on to run() and cleanup() through @ctx. */ +static enum scx_test_status setup(void **ctx) +{ + struct select_cpu_vtime *skel; + + skel = select_cpu_vtime__open_and_load(); + SCX_FAIL_IF(!skel, "Failed to open and load skel"); + *ctx = skel; + + return SCX_TEST_PASS; +} +/* + * Check that consumed starts out false, attach the scheduler, wait one second, + * and expect the BPF side to have set consumed. + * NOTE(review): a failing SCX_ASSERT() after the attach returns before + * bpf_link__destroy() is reached. + */ +static enum scx_test_status run(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + struct bpf_link *link; + + SCX_ASSERT(!skel->bss->consumed); + + link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + sleep(1); + + SCX_ASSERT(skel->bss->consumed); + + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} +/* Destroy the skeleton created in setup(). */ +static void cleanup(void *ctx) +{ + struct select_cpu_vtime *skel = ctx; + + select_cpu_vtime__destroy(skel); +} + +struct scx_test select_cpu_vtime = { + .name = "select_cpu_vtime", + .description = "Test doing direct vtime-dispatching from " + "ops.select_cpu(), to a non-built-in DSQ", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&select_cpu_vtime) diff --git a/tools/testing/selftests/scx/test_example.c b/tools/testing/selftests/scx/test_example.c new file mode 100644 index 0000000000000..ce36cdf03cdc5 --- /dev/null +++ b/tools/testing/selftests/scx/test_example.c @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet + */ +#include +#include +#include "scx_test.h" + +/* Flags recording which callbacks have run, used to check their order. */ +static bool setup_called = false; +static bool run_called = false; +static bool cleanup_called = false; + +/* Object handed from setup() to run() through the test context pointer. */ +static int context = 10; + +/* Mark setup as done and publish the address of context through @ctx. */ +static enum scx_test_status setup(void **ctx) +{ + setup_called = true; + *ctx = &context; + + return SCX_TEST_PASS; +} + +/* + * Check that setup() ran first, that neither run() nor cleanup() has run yet, + * and that @ctx points at the value published by setup(). + */ +static enum scx_test_status run(void *ctx) +{ + int *arg = ctx; + + SCX_ASSERT(setup_called); + SCX_ASSERT(!run_called && !cleanup_called); + SCX_EQ(*arg, context); + + run_called = true; + return SCX_TEST_PASS; +} + +/* + * Check that cleanup follows run() and happens only once. The flag is set + * here so that a second invocation trips the SCX_BUG_ON() check. + */ +static void cleanup(void *ctx) +{ + SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); + cleanup_called = true; +} + +struct scx_test example = { + .name = "example", + .description = "Validate the basic function of the test suite itself", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&example)