sched: introduce Work as the unified schedulable unit

Refactor the scheduler so all schedulable work is wrapped in Arc<Work>,
replacing the previous per-CPU wait_q design where sleeping tasks were
bound to a specific CPU. Wakers now hold direct Arc<Work> references and
can re-enqueue tasks on any CPU upon wakeup.

Key changes:

- Add Work struct wrapping OwnedTask with an AtomicTaskState and
  scheduler metadata (SchedulerData), replacing the old SchedulableTask.
  Remove Task::state (Arc<SpinLock<TaskState>>). Work::state is now the
  single source of truth for task state.

- Rewrite the run queue using BinaryHeap-based eligible/ineligible split
  (EEVDF) with a dedicated VClock, replacing the BTreeMap linear scan.
  Extract vclock into its own module.

- Rewrite wakers to hold Arc<Work> directly instead of looking up tasks
  by TaskDescriptor from TASK_LIST.

- Replace lock-based sleep transitions in uspc_ret with atomic CAS
  (try_sleep_current) that correctly detects concurrent Woken state.

- Simplify least-tasked-CPU metric to use only run-queue weight, since
  sleeping tasks are no longer bound to any CPU.

- Add current_work() accessor.
This commit is contained in:
Matthew Leach
2026-03-10 22:00:56 +00:00
parent e12af349ad
commit 5ebfc29cd2
22 changed files with 825 additions and 623 deletions

View File

@@ -60,6 +60,7 @@ pub async fn sys_capget(hdrp: TUA<CapUserHeader>, datap: TUA<CapUserData>) -> Re
.iter()
.find(|task| task.0.tgid.value() == header.pid as u32)
.and_then(|task| task.1.upgrade())
.map(|x| x.t_shared.clone())
.ok_or(KernelError::NoProcess)?
};
match header.version {
@@ -95,6 +96,7 @@ pub async fn sys_capset(hdrp: TUA<CapUserHeader>, datap: TUA<CapUserData>) -> Re
.iter()
.find(|task| task.0.tgid.value() == header.pid as u32)
.and_then(|task| task.1.upgrade())
.map(|x| x.t_shared.clone())
.ok_or(KernelError::NoProcess)?
};

View File

@@ -1,10 +1,10 @@
use super::owned::OwnedTask;
use super::ptrace::{PTrace, TracePoint, ptrace_stop};
use super::{ctx::Context, thread_group::signal::SigSet};
use crate::kernel::cpu_id::CpuId;
use crate::memory::uaccess::copy_to_user;
use crate::sched::sched_task::Work;
use crate::{
process::{TASK_LIST, Task, TaskState},
process::{TASK_LIST, Task},
sched::{self, current::current_task},
sync::SpinLock,
};
@@ -170,8 +170,6 @@ pub async fn sys_clone(
cwd,
root,
creds: SpinLock::new(creds),
state: Arc::new(SpinLock::new(TaskState::Runnable)),
last_cpu: SpinLock::new(CpuId::this()),
ptrace: SpinLock::new(ptrace),
utime: AtomicUsize::new(0),
stime: AtomicUsize::new(0),
@@ -181,28 +179,29 @@ pub async fn sys_clone(
}
};
let tid = new_task.tid;
let desc = new_task.descriptor();
let work = Work::new(Box::new(new_task));
TASK_LIST
.lock_save_irq()
.insert(new_task.descriptor(), Arc::downgrade(&new_task.t_shared));
.insert(desc, Arc::downgrade(&work));
new_task
.process
work.process
.tasks
.lock_save_irq()
.insert(tid, Arc::downgrade(&new_task.t_shared));
.insert(desc.tid, Arc::downgrade(&work));
sched::insert_task_cross_cpu(work);
sched::insert_task_cross_cpu(Box::new(new_task));
NUM_FORKS.fetch_add(1, core::sync::atomic::Ordering::Relaxed);
// Honour CLONE_*SETTID semantics for the parent and (shared-VM) child.
if flags.contains(CloneFlags::CLONE_PARENT_SETTID) && !parent_tidptr.is_null() {
copy_to_user(parent_tidptr, tid.value()).await?;
copy_to_user(parent_tidptr, desc.tid.value()).await?;
}
if flags.contains(CloneFlags::CLONE_CHILD_SETTID) && !child_tidptr.is_null() {
copy_to_user(child_tidptr, tid.value()).await?;
copy_to_user(child_tidptr, desc.tid.value()).await?;
}
Ok(tid.value() as _)
Ok(desc.tid.value() as _)
}

View File

@@ -1,10 +1,10 @@
use super::{
TASK_LIST, TaskState,
TASK_LIST,
ptrace::{TracePoint, ptrace_stop},
thread_group::{ProcessState, Tgid, ThreadGroup, signal::SigId, wait::ChildState},
threading::futex::{self, key::FutexKey},
};
use crate::sched::current::current_task;
use crate::sched::{self, current::current_task};
use crate::{memory::uaccess::copy_to_user, sched::current::current_task_shared};
use alloc::vec::Vec;
use libkernel::error::Result;
@@ -34,7 +34,8 @@ pub fn do_exit_group(exit_code: ChildState) {
if *process_state != ProcessState::Running {
// We're already on our way out. Just kill this thread.
drop(process_state);
*task.state.lock_save_irq() = TaskState::Finished;
drop(task);
sched::current_work().state.finish();
return;
}
@@ -51,7 +52,7 @@ pub fn do_exit_group(exit_code: ChildState) {
// TODO: Send an IPI/Signal to halt execution now. For now, just
// wait for the scheduler to never schedule any of its tasks
// again.
*other_thread.state.lock_save_irq() = TaskState::Finished;
other_thread.state.finish();
}
}
}
@@ -87,7 +88,8 @@ pub fn do_exit_group(exit_code: ChildState) {
.set_signal(SigId::SIGCHLD);
// 5. This thread is now finished.
*task.state.lock_save_irq() = TaskState::Finished;
drop(task);
sched::current_work().state.finish();
// NOTE: the scheduler will never execute the task again since its
// state is set to Finished.
@@ -151,7 +153,7 @@ pub async fn sys_exit(exit_code: usize) -> Result<usize> {
Ok(0)
} else {
// Mark our own state as finished.
*task.state.lock_save_irq() = TaskState::Finished;
sched::current_work().state.finish();
// Remove ourselves from the process's thread list.
tasks_lock.remove(&task.tid);

View File

@@ -1,5 +1,6 @@
use crate::drivers::timer::Instant;
use crate::sched::CPU_STAT;
use crate::sched::sched_task::Work;
use crate::{
arch::ArchImpl,
kernel::cpu_id::CpuId,
@@ -14,7 +15,6 @@ use alloc::{
collections::btree_map::BTreeMap,
sync::{Arc, Weak},
};
use core::fmt::Display;
use core::sync::atomic::{AtomicUsize, Ordering};
use creds::Credentials;
use fd_table::FileDescriptorTable;
@@ -124,35 +124,6 @@ impl TaskDescriptor {
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskState {
Running,
Runnable,
Woken,
Stopped,
Sleeping,
Finished,
}
impl Display for TaskState {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let state_str = match self {
TaskState::Running => "R",
TaskState::Runnable => "R",
TaskState::Woken => "W",
TaskState::Stopped => "T",
TaskState::Sleeping => "S",
TaskState::Finished => "Z",
};
write!(f, "{state_str}")
}
}
impl TaskState {
pub fn is_finished(self) -> bool {
matches!(self, Self::Finished)
}
}
pub type ProcVM = ProcessVM<<ArchImpl as VirtualMemory>::ProcessAddressSpace>;
#[derive(Copy, Clone)]
@@ -184,8 +155,6 @@ pub struct Task {
pub root: Arc<SpinLock<(Arc<dyn Inode>, PathBuf)>>,
pub creds: SpinLock<Credentials>,
pub fd_table: Arc<SpinLock<FileDescriptorTable>>,
pub state: Arc<SpinLock<TaskState>>,
pub last_cpu: SpinLock<CpuId>,
pub ptrace: SpinLock<PTrace>,
pub utime: AtomicUsize,
pub stime: AtomicUsize,
@@ -308,7 +277,7 @@ impl Task {
}
}
pub fn find_task_by_descriptor(descriptor: &TaskDescriptor) -> Option<Arc<Task>> {
pub fn find_task_by_descriptor(descriptor: &TaskDescriptor) -> Option<Arc<Work>> {
TASK_LIST
.lock_save_irq()
.get(descriptor)
@@ -316,11 +285,11 @@ pub fn find_task_by_descriptor(descriptor: &TaskDescriptor) -> Option<Arc<Task>>
}
/// Finds the root task for the given thread group
pub fn find_process_by_tgid(tgid: Tgid) -> Option<Arc<Task>> {
pub fn find_process_by_tgid(tgid: Tgid) -> Option<Arc<Work>> {
find_task_by_descriptor(&TaskDescriptor::from_tgid_tid(tgid, Tid::from_tgid(tgid)))
}
pub static TASK_LIST: SpinLock<BTreeMap<TaskDescriptor, Weak<Task>>> =
pub static TASK_LIST: SpinLock<BTreeMap<TaskDescriptor, Weak<Work>>> =
SpinLock::new(BTreeMap::new());
unsafe impl Send for Task {}

View File

@@ -1,7 +1,5 @@
use core::ops::Deref;
use super::{
Comm, Task, TaskState, Tid,
Comm, Task, Tid,
creds::Credentials,
ctx::{Context, UserCtx},
fd_table::FileDescriptorTable,
@@ -13,14 +11,13 @@ use super::{
},
threading::RobustListHead,
};
use crate::drivers::timer::{Instant, now};
use crate::{arch::Arch, fs::DummyInode, sync::SpinLock};
use crate::{
arch::{Arch, ArchImpl},
fs::DummyInode,
kernel::cpu_id::CpuId,
sync::SpinLock,
arch::ArchImpl,
drivers::timer::{Instant, now},
};
use alloc::sync::Arc;
use core::ops::Deref;
use core::sync::atomic::AtomicUsize;
use libkernel::{
VirtualMemory,
@@ -72,13 +69,11 @@ impl OwnedTask {
tid: Tid::idle_for_cpu(),
comm: Arc::new(SpinLock::new(Comm::new("idle"))),
process: thread_group_builder.build(),
state: Arc::new(SpinLock::new(TaskState::Runnable)),
cwd: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
root: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
creds: SpinLock::new(Credentials::new_root()),
vm: Arc::new(SpinLock::new(vm)),
fd_table: Arc::new(SpinLock::new(FileDescriptorTable::new())),
last_cpu: SpinLock::new(CpuId::this()),
ptrace: SpinLock::new(PTrace::new()),
utime: AtomicUsize::new(0),
stime: AtomicUsize::new(0),
@@ -102,7 +97,6 @@ impl OwnedTask {
tid: Tid(1),
comm: Arc::new(SpinLock::new(Comm::new("init"))),
process: ThreadGroupBuilder::new(Tgid::init()).build(),
state: Arc::new(SpinLock::new(TaskState::Runnable)),
cwd: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
root: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
creds: SpinLock::new(Credentials::new_root()),
@@ -110,7 +104,6 @@ impl OwnedTask {
ProcessVM::empty().expect("Could not create init process's VM"),
)),
fd_table: Arc::new(SpinLock::new(FileDescriptorTable::new())),
last_cpu: SpinLock::new(CpuId::this()),
ptrace: SpinLock::new(PTrace::new()),
last_account: AtomicUsize::new(0),
utime: AtomicUsize::new(0),

View File

@@ -1,24 +1,25 @@
use core::future::poll_fn;
use core::task::{Poll, Waker};
use crate::arch::{Arch, ArchImpl};
use crate::fs::syscalls::iov::IoVec;
use crate::memory::uaccess::{copy_from_user, copy_to_user};
use crate::process::TASK_LIST;
use crate::process::thread_group::signal::SigId;
use crate::sched::current::{current_task, current_task_shared};
use super::thread_group::{ThreadGroup, wait::ChildState};
use crate::{
arch::{Arch, ArchImpl},
fs::syscalls::iov::IoVec,
memory::uaccess::{copy_from_user, copy_to_user},
process::{TASK_LIST, thread_group::signal::SigId},
sched::current::{current_task, current_task_shared},
};
use alloc::sync::Arc;
use bitflags::Flags;
use libkernel::error::{KernelError, Result};
use libkernel::memory::address::UA;
use core::{
future::poll_fn,
task::{Poll, Waker},
};
use libkernel::{
error::{KernelError, Result},
memory::address::UA,
};
use log::warn;
type GpRegs = <ArchImpl as Arch>::PTraceGpRegs;
use super::TaskState;
use super::thread_group::ThreadGroup;
use super::thread_group::wait::ChildState;
const PTRACE_EVENT_FORK: usize = 1;
const PTRACE_EVENT_VFORK: usize = 2;
const PTRACE_EVENT_CLONE: usize = 3;
@@ -43,7 +44,7 @@ bitflags::bitflags! {
const PTRACE_O_SUSPEND_SECCOMP = 1 << 21;
}
#[derive(Clone, Copy, PartialEq)]
#[derive(Clone, Copy, PartialEq, Debug)]
pub struct TracePoint: u32 {
const SyscallEntry = 0x01;
const SyscallExit = 0x02;
@@ -177,9 +178,6 @@ impl PTrace {
}
pub fn set_waker(&mut self, waker: Waker) {
// Ensure we never override an already existing waker.
debug_assert!(self.waker.is_none());
self.waker = Some(waker);
}
@@ -259,22 +257,29 @@ impl TryFrom<i32> for PtraceOperation {
pub async fn ptrace_stop(point: TracePoint) -> bool {
let task_sh = current_task_shared();
{
let mut ptrace = task_sh.ptrace.lock_save_irq();
if ptrace.hit_trace_point(point, current_task().ctx.user()) {
ptrace.notify_tracer_of_trap(&task_sh.process);
} else {
return false;
}
}
let mut notified = false;
poll_fn(|cx| {
let mut ptrace = task_sh.ptrace.lock_save_irq();
if matches!(ptrace.state, Some(PTraceState::Running)) {
if !notified {
// First poll: hit the trace point, set waker, then notify.
// The waker must be set *before* notification so the tracer
// can always find it when it does PTRACE_SYSCALL/CONT.
if !ptrace.hit_trace_point(point, current_task().ctx.user()) {
return Poll::Ready(false);
}
notified = true;
ptrace.set_waker(cx.waker().clone());
ptrace.notify_tracer_of_trap(&task_sh.process);
Poll::Pending
} else if matches!(ptrace.state, Some(PTraceState::Running)) {
// Tracer resumed us.
Poll::Ready(true)
} else {
// Re-polled (e.g. spurious wakeup from signal) but tracer
// hasn't resumed yet. Refresh the waker and go back to sleep.
ptrace.set_waker(cx.waker().clone());
Poll::Pending
}
@@ -381,7 +386,9 @@ pub async fn sys_ptrace(op: i32, pid: u64, addr: UA, data: UA) -> Result<usize>
.break_points
.remove(TracePoint::SyscallEntry | TracePoint::SyscallExit);
*target_task.state.lock_save_irq() = TaskState::Runnable;
if let Some(waker) = ptrace.waker.take() {
waker.wake();
}
Ok(0)
}

View File

@@ -1,5 +1,12 @@
use super::{Task, TaskState, Tid};
use crate::{memory::uaccess::UserCopyable, sched::waker::create_waker, sync::SpinLock};
use super::Tid;
use crate::{
memory::uaccess::UserCopyable,
sched::{
sched_task::{Work, state::TaskState},
waker::create_waker,
},
sync::SpinLock,
};
use alloc::{
collections::btree_map::BTreeMap,
sync::{Arc, Weak},
@@ -95,7 +102,7 @@ pub struct ThreadGroup {
pub umask: SpinLock<u32>,
pub parent: SpinLock<Option<Weak<ThreadGroup>>>,
pub children: SpinLock<BTreeMap<Tgid, Arc<ThreadGroup>>>,
pub tasks: SpinLock<BTreeMap<Tid, Weak<Task>>>,
pub tasks: SpinLock<BTreeMap<Tid, Weak<Work>>>,
pub signals: Arc<SpinLock<SignalActionState>>,
pub rsrc_lim: Arc<SpinLock<ResourceLimits>>,
pub pending_signals: SpinLock<SigSet>,
@@ -165,13 +172,10 @@ impl ThreadGroup {
*self.pending_signals.lock_save_irq() = SigSet::SIGKILL;
for task in self.tasks.lock_save_irq().values() {
if let Some(task) = task.upgrade()
&& matches!(
*task.state.lock_save_irq(),
TaskState::Stopped | TaskState::Sleeping
)
{
create_waker(task.descriptor()).wake();
if let Some(task) = task.upgrade() {
// Wake will handle Sleeping/Stopped → Enqueue,
// and Running/Pending* → PreventedSleep (sets Woken).
create_waker(task).wake();
}
}
}
@@ -182,8 +186,12 @@ impl ThreadGroup {
for task in self.tasks.lock_save_irq().values() {
if let Some(task) = task.upgrade()
&& matches!(
*task.state.lock_save_irq(),
TaskState::Runnable | TaskState::Running
task.state.load(Ordering::Acquire),
TaskState::Runnable
| TaskState::Running
| TaskState::Woken
| TaskState::PendingSleep
| TaskState::PendingStop
)
{
// Signal delivered. This task will eventually be
@@ -196,7 +204,7 @@ impl ThreadGroup {
// No task will pick up the signal. Wake one up.
for task in self.tasks.lock_save_irq().values() {
if let Some(task) = task.upgrade() {
create_waker(task.descriptor()).wake();
create_waker(task).wake();
return;
}
}