From db7b5f17b61d493ec5b2e4cbb645db6239c6b2b6 Mon Sep 17 00:00:00 2001
From: David Vernet
Date: Wed, 22 May 2024 13:54:06 -0500
Subject: [PATCH] scx: Invoke ops.exit_task() for TASK_DEAD tasks on disable
 path

In commit f26683180087 ("scx: Close a small race window in the enable
path which can lead to use-after-free"), we fixed a race window on the
enable path that could cause a crash. This fix is fine, but it was a
bit too aggressive in that it could also cause us to miss
ops.exit_task() invocations in the following scenario:

1. A task exits and invokes do_task_dead() (making its state
   TASK_DEAD), but someone still holds a refcount on it somewhere.

2. The scheduler is disabled.

3. On the disable path, we don't invoke ops.exit_task().

4. We don't invoke it later in sched_ext_free() either, because by
   then the scheduler has been disabled.

Let's ensure we don't skip exiting the task by still calling
scx_ops_exit_task() for TASK_DEAD tasks on the disable path.

Signed-off-by: David Vernet
---
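(Note, not for inclusion in the commit message: the scenario above can
be modeled in plain userspace C. The sketch below uses entirely
hypothetical names (toy_task, toy_ops, etc., none of them kernel APIs)
to show why a task that dies while a reference is still held must be
exited on the disable path: by the time the final reference is
dropped, the scheduler is already unloaded. Passing false for
include_dead in main() reproduces the missed callback.)

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_ops {
        void (*exit_task)(int pid);     /* analog of ops.exit_task() */
};

struct toy_task {
        int pid;
        int refcount;                   /* someone may still hold a ref */
        bool dead;                      /* analog of TASK_DEAD */
        bool ops_exited;                /* has exit_task() been run? */
};

static struct toy_ops *cur_ops;         /* NULL once the scheduler unloads */

static void exit_cb(int pid)
{
        printf("ops.exit_task() invoked for pid %d\n", pid);
}

static struct toy_ops toy_sched = { .exit_task = exit_cb };

/* Disable path: @include_dead mirrors the new iterator argument. */
static void disable_sched(struct toy_task *p, bool include_dead)
{
        if ((include_dead || !p->dead) && !p->ops_exited) {
                cur_ops->exit_task(p->pid);
                p->ops_exited = true;
        }
        cur_ops = NULL;                 /* scheduler is now unloaded */
}

/* Final put: analog of the sched_ext_free() path. */
static void put_task(struct toy_task *p)
{
        if (--p->refcount)
                return;
        /* Too late to call exit_task() here: cur_ops is already NULL. */
        if (!p->ops_exited)
                printf("BUG: pid %d freed without ops.exit_task()\n", p->pid);
        free(p);
}

int main(void)
{
        struct toy_task *p = calloc(1, sizeof(*p));

        p->pid = 42;
        p->refcount = 2;                /* the task itself plus a holder */
        cur_ops = &toy_sched;

        p->dead = true;                 /* 1. task dies, a ref remains */
        put_task(p);                    /* the task's own ref is dropped */

        disable_sched(p, true);         /* 2.-3. false shows the bug */
        put_task(p);                    /* 4. final ref after unload */
        return 0;
}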
 kernel/sched/ext.c | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2d497712c7b7..56309010b627 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1333,14 +1333,16 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 }
 
 /**
- * scx_task_iter_next_filtered_locked - Next alive non-idle task with its rq locked
+ * scx_task_iter_next_locked - Next non-idle task with its rq locked
  * @iter: iterator to walk
+ * @include_dead: Whether we should include dead tasks in the iteration
  *
- * Visit the next alive non-idle task with its rq lock held. See
- * scx_task_iter_init() for details.
+ * Visit the next non-idle task with its rq lock held. Allows callers to
+ * specify whether they would like to filter out dead tasks. See
+ * scx_task_iter_init() for details.
  */
 static struct task_struct *
-scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
+scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
 {
        struct task_struct *p;
 retry:
@@ -1367,7 +1369,7 @@ scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
         * the final __schedule() while we're locking its rq and thus will stay
         * alive until the rq is unlocked.
         */
-       if (READ_ONCE(p->__state) == TASK_DEAD)
+       if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
                goto retry;
 
        return p;
@@ -4393,19 +4395,31 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
        spin_lock_irq(&scx_tasks_lock);
        scx_task_iter_init(&sti);
-       while ((p = scx_task_iter_next_filtered_locked(&sti))) {
+       /*
+        * Invoke scx_ops_exit_task() on all non-idle tasks, including
+        * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount,
+        * we may not have invoked sched_ext_free() on them by the time a
+        * scheduler is disabled. We must therefore exit the task here, or
+        * we'd fail to invoke ops.exit_task(), as the scheduler will have
+        * been unloaded by the time the task is subsequently exited on the
+        * sched_ext_free() path.
+        */
+       while ((p = scx_task_iter_next_locked(&sti, true))) {
                const struct sched_class *old_class = p->sched_class;
                struct sched_enq_and_set_ctx ctx;
 
-               sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+               if (READ_ONCE(p->__state) != TASK_DEAD) {
+                       sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE,
+                                              &ctx);
 
-               p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
-               __setscheduler_prio(p, p->prio);
-               check_class_changing(task_rq(p), p, old_class);
+                       p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
+                       __setscheduler_prio(p, p->prio);
+                       check_class_changing(task_rq(p), p, old_class);
 
-               sched_enq_and_set_task(&ctx);
+                       sched_enq_and_set_task(&ctx);
 
-               check_class_changed(task_rq(p), p, old_class, p->prio);
+                       check_class_changed(task_rq(p), p, old_class, p->prio);
+               }
                scx_ops_exit_task(p);
        }
        scx_task_iter_exit(&sti);
@@ -5034,7 +5048,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
        spin_lock_irq(&scx_tasks_lock);
        scx_task_iter_init(&sti);
-       while ((p = scx_task_iter_next_filtered_locked(&sti))) {
+       while ((p = scx_task_iter_next_locked(&sti, false))) {
                get_task_struct(p);
                scx_task_iter_rq_unlock(&sti);
                spin_unlock_irq(&scx_tasks_lock);
@@ -5088,7 +5102,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
        WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
 
        scx_task_iter_init(&sti);
-       while ((p = scx_task_iter_next_filtered_locked(&sti))) {
+       while ((p = scx_task_iter_next_locked(&sti, false))) {
                const struct sched_class *old_class = p->sched_class;
                struct sched_enq_and_set_ctx ctx;