From f266831800876268f4ea294fc48b93b0292ae8f3 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 17 May 2024 13:43:51 -1000
Subject: [PATCH] scx: Close a small race window in the enable path which can lead to use-after-free

scx_ops_enable() was iterating all tasks without locking their rqs and
then calling get_task_struct() on them. However, this can race against
the task free path. The task struct itself is accessible but its refcnt
may already have reached zero by the time get_task_struct() is called.
This triggers the following refcnt warning.

  WARNING: CPU: 11 PID: 2834521 at lib/refcount.c:25 refcount_warn_saturate+0x7d/0xe0
  ...
  RIP: 0010:refcount_warn_saturate+0x7d/0xe0
  ...
  Call Trace:
   bpf_scx_reg+0x851/0x890
   bpf_struct_ops_link_create+0xbd/0xe0
   __sys_bpf+0x2862/0x2d20
   __x64_sys_bpf+0x18/0x20
   do_syscall_64+0x3d/0x90
   entry_SYSCALL_64_after_hwframe+0x46/0xb0

This can also affect other loops as they can end up calling sched
functions on tasks which have already been destroyed.

Let's fix it by making scx_task_iter_next_filtered_locked() also filter
out TASK_DEAD tasks and switching the scx_task_iter_next_filtered()
usage in scx_ops_enable() to scx_task_iter_next_filtered_locked(). This
makes the iterator's next function always test TASK_DEAD while holding
the task's rq lock, and the rq lock guarantees that the task won't be
freed, closing the race window.
---
 kernel/sched/ext.c | 81 +++++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1160bed2895b..2d497712c7b7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1268,6 +1268,28 @@ static void scx_task_iter_init(struct scx_task_iter *iter)
 	iter->locked = NULL;
 }
 
+/**
+ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
+ * @iter: iterator to unlock rq for
+ *
+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
+ * the task currently being visited. Unlock the rq if so. This function can be
+ * safely called anytime during an iteration.
+ *
+ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
+ * not locking an rq.
+ */
+static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{
+	if (iter->locked) {
+		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
+		iter->locked = NULL;
+		return true;
+	} else {
+		return false;
+	}
+}
+
 /**
  * scx_task_iter_exit - Exit a task iterator
  * @iter: iterator to exit
@@ -1278,19 +1300,10 @@ static void scx_task_iter_init(struct scx_task_iter *iter)
  */
 static void scx_task_iter_exit(struct scx_task_iter *iter)
 {
-	struct list_head *cursor = &iter->cursor.tasks_node;
-
 	lockdep_assert_held(&scx_tasks_lock);
 
-	if (iter->locked) {
-		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
-		iter->locked = NULL;
-	}
-
-	if (list_empty(cursor))
-		return;
-
-	list_del_init(cursor);
+	scx_task_iter_rq_unlock(iter);
+	list_del_init(&iter->cursor.tasks_node);
 }
 
 /**
@@ -1320,15 +1333,18 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 }
 
 /**
- * scx_task_iter_next_filtered - Next non-idle task
+ * scx_task_iter_next_filtered_locked - Next alive non-idle task with its rq locked
  * @iter: iterator to walk
  *
- * Visit the next non-idle task. See scx_task_iter_init() for details.
+ * Visit the next alive non-idle task with its rq lock held. See
+ * scx_task_iter_init() for details.
  */
 static struct task_struct *
-scx_task_iter_next_filtered(struct scx_task_iter *iter)
+scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
 {
 	struct task_struct *p;
 
+retry:
+	scx_task_iter_rq_unlock(iter);
 	while ((p = scx_task_iter_next(iter))) {
 		/*
@@ -1336,34 +1352,24 @@ scx_task_iter_next_filtered(struct scx_task_iter *iter)
 		 * which haven't yet been onlined. Test sched_class directly.
 		 */
 		if (p->sched_class != &idle_sched_class)
-			return p;
-	}
-	return NULL;
-}
-
-/**
- * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked
- * @iter: iterator to walk
- *
- * Visit the next non-idle task with its rq lock held. See scx_task_iter_init()
- * for details.
- */
-static struct task_struct *
-scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
-{
-	struct task_struct *p;
-
-	if (iter->locked) {
-		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
-		iter->locked = NULL;
+			break;
 	}
-
-	p = scx_task_iter_next_filtered(iter);
 	if (!p)
 		return NULL;
 
 	iter->rq = task_rq_lock(p, &iter->rf);
 	iter->locked = p;
+
+	/*
+	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
+	 * the final __schedule(), won't ever need to be scheduled again and can
+	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
+	 * the final __schedule() while we're locking its rq and thus will stay
+	 * alive until the rq is unlocked.
+	 */
+	if (READ_ONCE(p->__state) == TASK_DEAD)
+		goto retry;
+
 	return p;
 }
 
@@ -5028,8 +5034,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	spin_lock_irq(&scx_tasks_lock);
 	scx_task_iter_init(&sti);
-	while ((p = scx_task_iter_next_filtered(&sti))) {
+	while ((p = scx_task_iter_next_filtered_locked(&sti))) {
 		get_task_struct(p);
+		scx_task_iter_rq_unlock(&sti);
 		spin_unlock_irq(&scx_tasks_lock);
 
 		ret = scx_ops_init_task(p, task_group(p), false);
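
For reference, the calling convention the patch establishes for the locked
iterator looks roughly as follows. This is a condensed sketch, not part of the
patch; the placeholder per-task work and the re-locking at the bottom of the
loop are illustrative assumptions, only the get_task_struct() /
scx_task_iter_rq_unlock() ordering is taken from the scx_ops_enable() hunk
above.

	struct scx_task_iter sti;
	struct task_struct *p;

	spin_lock_irq(&scx_tasks_lock);
	scx_task_iter_init(&sti);
	while ((p = scx_task_iter_next_filtered_locked(&sti))) {
		/*
		 * @p's rq is locked and TASK_DEAD tasks have already been
		 * filtered out, so @p can't be freed here. Pin it before
		 * dropping the locks for sleepable per-task work.
		 */
		get_task_struct(p);
		scx_task_iter_rq_unlock(&sti);
		spin_unlock_irq(&scx_tasks_lock);

		/* sleepable per-task work, e.g. scx_ops_init_task() */

		put_task_struct(p);
		spin_lock_irq(&scx_tasks_lock);
	}
	scx_task_iter_exit(&sti);
	spin_unlock_irq(&scx_tasks_lock);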