From f266831800876268f4ea294fc48b93b0292ae8f3 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 17 May 2024 13:43:51 -1000
Subject: [PATCH] scx: Close a small race window in the enable path which can lead to use-after-free

scx_ops_enable() was iterating all tasks without locking their rqs and
then calling get_task_struct() on them. However, this can race against
the task free path. The task struct itself is accessible but its refcnt
may already have reached zero by the time get_task_struct() is called.
This triggers the following refcnt warning.

  WARNING: CPU: 11 PID: 2834521 at lib/refcount.c:25 refcount_warn_saturate+0x7d/0xe0
  ...
  RIP: 0010:refcount_warn_saturate+0x7d/0xe0
  ...
  Call Trace:
   bpf_scx_reg+0x851/0x890
   bpf_struct_ops_link_create+0xbd/0xe0
   __sys_bpf+0x2862/0x2d20
   __x64_sys_bpf+0x18/0x20
   do_syscall_64+0x3d/0x90
   entry_SYSCALL_64_after_hwframe+0x46/0xb0

This can also affect other loops as they can end up calling sched
functions on tasks which have already been destroyed.

Let's fix it by making scx_task_iter_next_filtered_locked() also filter
out TASK_DEAD tasks and switching the scx_task_iter_next_filtered()
usage in scx_ops_enable() to scx_task_iter_next_filtered_locked(). This
makes the iterator's next function always test TASK_DEAD while holding
the task's rq lock, and the rq lock guarantees that the task won't be
freed, closing the race window.
---
 kernel/sched/ext.c | 81 +++++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1160bed2895b..2d497712c7b7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1268,6 +1268,28 @@ static void scx_task_iter_init(struct scx_task_iter *iter)
 	iter->locked = NULL;
 }
 
+/**
+ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
+ * @iter: iterator to unlock rq for
+ *
+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
+ * the task currently being visited. Unlock the rq if so. This function can be
+ * safely called anytime during an iteration.
+ *
+ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was
+ * not locking an rq.
+ */
+static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{
+	if (iter->locked) {
+		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
+		iter->locked = NULL;
+		return true;
+	} else {
+		return false;
+	}
+}
+
 /**
  * scx_task_iter_exit - Exit a task iterator
  * @iter: iterator to exit
@@ -1278,19 +1300,10 @@ static void scx_task_iter_init(struct scx_task_iter *iter)
  */
 static void scx_task_iter_exit(struct scx_task_iter *iter)
 {
-	struct list_head *cursor = &iter->cursor.tasks_node;
-
 	lockdep_assert_held(&scx_tasks_lock);
 
-	if (iter->locked) {
-		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
-		iter->locked = NULL;
-	}
-
-	if (list_empty(cursor))
-		return;
-
-	list_del_init(cursor);
+	scx_task_iter_rq_unlock(iter);
+	list_del_init(&iter->cursor.tasks_node);
 }
 
 /**
@@ -1320,15 +1333,18 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 }
 
 /**
- * scx_task_iter_next_filtered - Next non-idle task
+ * scx_task_iter_next_filtered_locked - Next alive non-idle task with its rq locked
  * @iter: iterator to walk
  *
- * Visit the next non-idle task. See scx_task_iter_init() for details.
+ * Visit the next alive non-idle task with its rq lock held. See
+ * scx_task_iter_init() for details.
  */
 static struct task_struct *
-scx_task_iter_next_filtered(struct scx_task_iter *iter)
+scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
 {
 	struct task_struct *p;
 
+retry:
+	scx_task_iter_rq_unlock(iter);
 	while ((p = scx_task_iter_next(iter))) {
 		/*
@@ -1336,34 +1352,24 @@ scx_task_iter_next_filtered(struct scx_task_iter *iter)
 		 * which haven't yet been onlined. Test sched_class directly.
 		 */
 		if (p->sched_class != &idle_sched_class)
-			return p;
-	}
-	return NULL;
-}
-
-/**
- * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked
- * @iter: iterator to walk
- *
- * Visit the next non-idle task with its rq lock held. See scx_task_iter_init()
- * for details.
- */
-static struct task_struct *
-scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
-{
-	struct task_struct *p;
-
-	if (iter->locked) {
-		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
-		iter->locked = NULL;
+			break;
 	}
-
-	p = scx_task_iter_next_filtered(iter);
 	if (!p)
 		return NULL;
 
 	iter->rq = task_rq_lock(p, &iter->rf);
 	iter->locked = p;
+
+	/*
+	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
+	 * the final __schedule(), won't ever need to be scheduled again and can
+	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
+	 * the final __schedule() while we're locking its rq and thus will stay
+	 * alive until the rq is unlocked.
+	 */
+	if (READ_ONCE(p->__state) == TASK_DEAD)
+		goto retry;
+
 	return p;
 }
 
@@ -5028,8 +5034,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	spin_lock_irq(&scx_tasks_lock);
 	scx_task_iter_init(&sti);
-	while ((p = scx_task_iter_next_filtered(&sti))) {
+	while ((p = scx_task_iter_next_filtered_locked(&sti))) {
 		get_task_struct(p);
+		scx_task_iter_rq_unlock(&sti);
 		spin_unlock_irq(&scx_tasks_lock);
 
 		ret = scx_ops_init_task(p, task_group(p), false);
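
For reference, the calling convention the patch establishes for the locked
iterator looks roughly as follows. This is a condensed sketch, not part of the
patch; the placeholder per-task work and the re-locking at the bottom of the
loop are illustrative assumptions, only the get_task_struct() /
scx_task_iter_rq_unlock() ordering is taken from the scx_ops_enable() hunk
above.

	struct scx_task_iter sti;
	struct task_struct *p;

	spin_lock_irq(&scx_tasks_lock);
	scx_task_iter_init(&sti);
	while ((p = scx_task_iter_next_filtered_locked(&sti))) {
		/*
		 * @p's rq is locked and TASK_DEAD tasks have already been
		 * filtered out, so @p can't be freed here. Pin it before
		 * dropping the locks for sleepable per-task work.
		 */
		get_task_struct(p);
		scx_task_iter_rq_unlock(&sti);
		spin_unlock_irq(&scx_tasks_lock);

		/* sleepable per-task work, e.g. scx_ops_init_task() */

		put_task_struct(p);
		spin_lock_irq(&scx_tasks_lock);
	}
	scx_task_iter_exit(&sti);
	spin_unlock_irq(&scx_tasks_lock);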