mirror of
https://github.com/hexagonal-sun/moss-kernel.git
synced 2026-04-20 15:18:26 -04:00
sched: introduce Work as the unified schedulable unit
Refactor the scheduler so that all schedulable work is wrapped in Arc<Work>, replacing the previous per-CPU wait_q design, in which sleeping tasks were bound to a specific CPU. Wakers now hold direct Arc<Work> references and can re-enqueue tasks on any CPU upon wakeup.

Key changes:
- Add a Work struct wrapping OwnedTask with an AtomicTaskState and scheduler metadata (SchedulerData), replacing the old SchedulableTask. Remove Task::state (Arc<SpinLock<TaskState>>). Work::state is now the single source of truth for task state.
- Rewrite the run queue using a BinaryHeap-based eligible/ineligible split (EEVDF) with a dedicated VClock, replacing the BTreeMap linear scan. Extract vclock into its own module.
- Rewrite wakers to hold Arc<Work> directly instead of looking up tasks by TaskDescriptor from TASK_LIST.
- Replace lock-based sleep transitions in uspc_ret with an atomic CAS (try_sleep_current) that correctly detects a concurrent Woken state.
- Simplify the least-tasked-CPU metric to use only run-queue weight, since sleeping tasks are no longer bound to any CPU.
- Add a current_work() accessor.
This commit is contained in:
@@ -60,6 +60,7 @@ pub async fn sys_capget(hdrp: TUA<CapUserHeader>, datap: TUA<CapUserData>) -> Re
|
||||
.iter()
|
||||
.find(|task| task.0.tgid.value() == header.pid as u32)
|
||||
.and_then(|task| task.1.upgrade())
|
||||
.map(|x| x.t_shared.clone())
|
||||
.ok_or(KernelError::NoProcess)?
|
||||
};
|
||||
match header.version {
|
||||
@@ -95,6 +96,7 @@ pub async fn sys_capset(hdrp: TUA<CapUserHeader>, datap: TUA<CapUserData>) -> Re
|
||||
.iter()
|
||||
.find(|task| task.0.tgid.value() == header.pid as u32)
|
||||
.and_then(|task| task.1.upgrade())
|
||||
.map(|x| x.t_shared.clone())
|
||||
.ok_or(KernelError::NoProcess)?
|
||||
};
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use super::owned::OwnedTask;
|
||||
use super::ptrace::{PTrace, TracePoint, ptrace_stop};
|
||||
use super::{ctx::Context, thread_group::signal::SigSet};
|
||||
use crate::kernel::cpu_id::CpuId;
|
||||
use crate::memory::uaccess::copy_to_user;
|
||||
use crate::sched::sched_task::Work;
|
||||
use crate::{
|
||||
process::{TASK_LIST, Task, TaskState},
|
||||
process::{TASK_LIST, Task},
|
||||
sched::{self, current::current_task},
|
||||
sync::SpinLock,
|
||||
};
|
||||
@@ -170,8 +170,6 @@ pub async fn sys_clone(
|
||||
cwd,
|
||||
root,
|
||||
creds: SpinLock::new(creds),
|
||||
state: Arc::new(SpinLock::new(TaskState::Runnable)),
|
||||
last_cpu: SpinLock::new(CpuId::this()),
|
||||
ptrace: SpinLock::new(ptrace),
|
||||
utime: AtomicUsize::new(0),
|
||||
stime: AtomicUsize::new(0),
|
||||
@@ -181,28 +179,29 @@ pub async fn sys_clone(
|
||||
}
|
||||
};
|
||||
|
||||
let tid = new_task.tid;
|
||||
let desc = new_task.descriptor();
|
||||
let work = Work::new(Box::new(new_task));
|
||||
|
||||
TASK_LIST
|
||||
.lock_save_irq()
|
||||
.insert(new_task.descriptor(), Arc::downgrade(&new_task.t_shared));
|
||||
.insert(desc, Arc::downgrade(&work));
|
||||
|
||||
new_task
|
||||
.process
|
||||
work.process
|
||||
.tasks
|
||||
.lock_save_irq()
|
||||
.insert(tid, Arc::downgrade(&new_task.t_shared));
|
||||
.insert(desc.tid, Arc::downgrade(&work));
|
||||
|
||||
sched::insert_task_cross_cpu(work);
|
||||
|
||||
sched::insert_task_cross_cpu(Box::new(new_task));
|
||||
NUM_FORKS.fetch_add(1, core::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
// Honour CLONE_*SETTID semantics for the parent and (shared-VM) child.
|
||||
if flags.contains(CloneFlags::CLONE_PARENT_SETTID) && !parent_tidptr.is_null() {
|
||||
copy_to_user(parent_tidptr, tid.value()).await?;
|
||||
copy_to_user(parent_tidptr, desc.tid.value()).await?;
|
||||
}
|
||||
if flags.contains(CloneFlags::CLONE_CHILD_SETTID) && !child_tidptr.is_null() {
|
||||
copy_to_user(child_tidptr, tid.value()).await?;
|
||||
copy_to_user(child_tidptr, desc.tid.value()).await?;
|
||||
}
|
||||
|
||||
Ok(tid.value() as _)
|
||||
Ok(desc.tid.value() as _)
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use super::{
|
||||
TASK_LIST, TaskState,
|
||||
TASK_LIST,
|
||||
ptrace::{TracePoint, ptrace_stop},
|
||||
thread_group::{ProcessState, Tgid, ThreadGroup, signal::SigId, wait::ChildState},
|
||||
threading::futex::{self, key::FutexKey},
|
||||
};
|
||||
use crate::sched::current::current_task;
|
||||
use crate::sched::{self, current::current_task};
|
||||
use crate::{memory::uaccess::copy_to_user, sched::current::current_task_shared};
|
||||
use alloc::vec::Vec;
|
||||
use libkernel::error::Result;
|
||||
@@ -34,7 +34,8 @@ pub fn do_exit_group(exit_code: ChildState) {
|
||||
if *process_state != ProcessState::Running {
|
||||
// We're already on our way out. Just kill this thread.
|
||||
drop(process_state);
|
||||
*task.state.lock_save_irq() = TaskState::Finished;
|
||||
drop(task);
|
||||
sched::current_work().state.finish();
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -51,7 +52,7 @@ pub fn do_exit_group(exit_code: ChildState) {
|
||||
// TODO: Send an IPI/Signal to halt execution now. For now, just
|
||||
// wait for the scheduler to never schedule any of its tasks
|
||||
// again.
|
||||
*other_thread.state.lock_save_irq() = TaskState::Finished;
|
||||
other_thread.state.finish();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -87,7 +88,8 @@ pub fn do_exit_group(exit_code: ChildState) {
|
||||
.set_signal(SigId::SIGCHLD);
|
||||
|
||||
// 5. This thread is now finished.
|
||||
*task.state.lock_save_irq() = TaskState::Finished;
|
||||
drop(task);
|
||||
sched::current_work().state.finish();
|
||||
|
||||
// NOTE: the scheduler will never execute the task again since its
|
||||
// state is set to Finished.
|
||||
@@ -151,7 +153,7 @@ pub async fn sys_exit(exit_code: usize) -> Result<usize> {
|
||||
Ok(0)
|
||||
} else {
|
||||
// Mark our own state as finished.
|
||||
*task.state.lock_save_irq() = TaskState::Finished;
|
||||
sched::current_work().state.finish();
|
||||
|
||||
// Remove ourself from the process's thread list.
|
||||
tasks_lock.remove(&task.tid);
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::drivers::timer::Instant;
|
||||
use crate::sched::CPU_STAT;
|
||||
use crate::sched::sched_task::Work;
|
||||
use crate::{
|
||||
arch::ArchImpl,
|
||||
kernel::cpu_id::CpuId,
|
||||
@@ -14,7 +15,6 @@ use alloc::{
|
||||
collections::btree_map::BTreeMap,
|
||||
sync::{Arc, Weak},
|
||||
};
|
||||
use core::fmt::Display;
|
||||
use core::sync::atomic::{AtomicUsize, Ordering};
|
||||
use creds::Credentials;
|
||||
use fd_table::FileDescriptorTable;
|
||||
@@ -124,35 +124,6 @@ impl TaskDescriptor {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TaskState {
|
||||
Running,
|
||||
Runnable,
|
||||
Woken,
|
||||
Stopped,
|
||||
Sleeping,
|
||||
Finished,
|
||||
}
|
||||
|
||||
impl Display for TaskState {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
let state_str = match self {
|
||||
TaskState::Running => "R",
|
||||
TaskState::Runnable => "R",
|
||||
TaskState::Woken => "W",
|
||||
TaskState::Stopped => "T",
|
||||
TaskState::Sleeping => "S",
|
||||
TaskState::Finished => "Z",
|
||||
};
|
||||
write!(f, "{state_str}")
|
||||
}
|
||||
}
|
||||
|
||||
impl TaskState {
|
||||
pub fn is_finished(self) -> bool {
|
||||
matches!(self, Self::Finished)
|
||||
}
|
||||
}
|
||||
pub type ProcVM = ProcessVM<<ArchImpl as VirtualMemory>::ProcessAddressSpace>;
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
@@ -184,8 +155,6 @@ pub struct Task {
|
||||
pub root: Arc<SpinLock<(Arc<dyn Inode>, PathBuf)>>,
|
||||
pub creds: SpinLock<Credentials>,
|
||||
pub fd_table: Arc<SpinLock<FileDescriptorTable>>,
|
||||
pub state: Arc<SpinLock<TaskState>>,
|
||||
pub last_cpu: SpinLock<CpuId>,
|
||||
pub ptrace: SpinLock<PTrace>,
|
||||
pub utime: AtomicUsize,
|
||||
pub stime: AtomicUsize,
|
||||
@@ -308,7 +277,7 @@ impl Task {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn find_task_by_descriptor(descriptor: &TaskDescriptor) -> Option<Arc<Task>> {
|
||||
pub fn find_task_by_descriptor(descriptor: &TaskDescriptor) -> Option<Arc<Work>> {
|
||||
TASK_LIST
|
||||
.lock_save_irq()
|
||||
.get(descriptor)
|
||||
@@ -316,11 +285,11 @@ pub fn find_task_by_descriptor(descriptor: &TaskDescriptor) -> Option<Arc<Task>>
|
||||
}
|
||||
|
||||
/// Finds the root task for the given thread group
|
||||
pub fn find_process_by_tgid(tgid: Tgid) -> Option<Arc<Task>> {
|
||||
pub fn find_process_by_tgid(tgid: Tgid) -> Option<Arc<Work>> {
|
||||
find_task_by_descriptor(&TaskDescriptor::from_tgid_tid(tgid, Tid::from_tgid(tgid)))
|
||||
}
|
||||
|
||||
pub static TASK_LIST: SpinLock<BTreeMap<TaskDescriptor, Weak<Task>>> =
|
||||
pub static TASK_LIST: SpinLock<BTreeMap<TaskDescriptor, Weak<Work>>> =
|
||||
SpinLock::new(BTreeMap::new());
|
||||
|
||||
unsafe impl Send for Task {}
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use core::ops::Deref;
|
||||
|
||||
use super::{
|
||||
Comm, Task, TaskState, Tid,
|
||||
Comm, Task, Tid,
|
||||
creds::Credentials,
|
||||
ctx::{Context, UserCtx},
|
||||
fd_table::FileDescriptorTable,
|
||||
@@ -13,14 +11,13 @@ use super::{
|
||||
},
|
||||
threading::RobustListHead,
|
||||
};
|
||||
use crate::drivers::timer::{Instant, now};
|
||||
use crate::{arch::Arch, fs::DummyInode, sync::SpinLock};
|
||||
use crate::{
|
||||
arch::{Arch, ArchImpl},
|
||||
fs::DummyInode,
|
||||
kernel::cpu_id::CpuId,
|
||||
sync::SpinLock,
|
||||
arch::ArchImpl,
|
||||
drivers::timer::{Instant, now},
|
||||
};
|
||||
use alloc::sync::Arc;
|
||||
use core::ops::Deref;
|
||||
use core::sync::atomic::AtomicUsize;
|
||||
use libkernel::{
|
||||
VirtualMemory,
|
||||
@@ -72,13 +69,11 @@ impl OwnedTask {
|
||||
tid: Tid::idle_for_cpu(),
|
||||
comm: Arc::new(SpinLock::new(Comm::new("idle"))),
|
||||
process: thread_group_builder.build(),
|
||||
state: Arc::new(SpinLock::new(TaskState::Runnable)),
|
||||
cwd: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
|
||||
root: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
|
||||
creds: SpinLock::new(Credentials::new_root()),
|
||||
vm: Arc::new(SpinLock::new(vm)),
|
||||
fd_table: Arc::new(SpinLock::new(FileDescriptorTable::new())),
|
||||
last_cpu: SpinLock::new(CpuId::this()),
|
||||
ptrace: SpinLock::new(PTrace::new()),
|
||||
utime: AtomicUsize::new(0),
|
||||
stime: AtomicUsize::new(0),
|
||||
@@ -102,7 +97,6 @@ impl OwnedTask {
|
||||
tid: Tid(1),
|
||||
comm: Arc::new(SpinLock::new(Comm::new("init"))),
|
||||
process: ThreadGroupBuilder::new(Tgid::init()).build(),
|
||||
state: Arc::new(SpinLock::new(TaskState::Runnable)),
|
||||
cwd: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
|
||||
root: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))),
|
||||
creds: SpinLock::new(Credentials::new_root()),
|
||||
@@ -110,7 +104,6 @@ impl OwnedTask {
|
||||
ProcessVM::empty().expect("Could not create init process's VM"),
|
||||
)),
|
||||
fd_table: Arc::new(SpinLock::new(FileDescriptorTable::new())),
|
||||
last_cpu: SpinLock::new(CpuId::this()),
|
||||
ptrace: SpinLock::new(PTrace::new()),
|
||||
last_account: AtomicUsize::new(0),
|
||||
utime: AtomicUsize::new(0),
|
||||
|
||||
@@ -1,24 +1,25 @@
|
||||
use core::future::poll_fn;
|
||||
use core::task::{Poll, Waker};
|
||||
|
||||
use crate::arch::{Arch, ArchImpl};
|
||||
use crate::fs::syscalls::iov::IoVec;
|
||||
use crate::memory::uaccess::{copy_from_user, copy_to_user};
|
||||
use crate::process::TASK_LIST;
|
||||
use crate::process::thread_group::signal::SigId;
|
||||
use crate::sched::current::{current_task, current_task_shared};
|
||||
use super::thread_group::{ThreadGroup, wait::ChildState};
|
||||
use crate::{
|
||||
arch::{Arch, ArchImpl},
|
||||
fs::syscalls::iov::IoVec,
|
||||
memory::uaccess::{copy_from_user, copy_to_user},
|
||||
process::{TASK_LIST, thread_group::signal::SigId},
|
||||
sched::current::{current_task, current_task_shared},
|
||||
};
|
||||
use alloc::sync::Arc;
|
||||
use bitflags::Flags;
|
||||
use libkernel::error::{KernelError, Result};
|
||||
use libkernel::memory::address::UA;
|
||||
use core::{
|
||||
future::poll_fn,
|
||||
task::{Poll, Waker},
|
||||
};
|
||||
use libkernel::{
|
||||
error::{KernelError, Result},
|
||||
memory::address::UA,
|
||||
};
|
||||
use log::warn;
|
||||
|
||||
type GpRegs = <ArchImpl as Arch>::PTraceGpRegs;
|
||||
|
||||
use super::TaskState;
|
||||
use super::thread_group::ThreadGroup;
|
||||
use super::thread_group::wait::ChildState;
|
||||
|
||||
const PTRACE_EVENT_FORK: usize = 1;
|
||||
const PTRACE_EVENT_VFORK: usize = 2;
|
||||
const PTRACE_EVENT_CLONE: usize = 3;
|
||||
@@ -43,7 +44,7 @@ bitflags::bitflags! {
|
||||
const PTRACE_O_SUSPEND_SECCOMP = 1 << 21;
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
#[derive(Clone, Copy, PartialEq, Debug)]
|
||||
pub struct TracePoint: u32 {
|
||||
const SyscallEntry = 0x01;
|
||||
const SyscallExit = 0x02;
|
||||
@@ -177,9 +178,6 @@ impl PTrace {
|
||||
}
|
||||
|
||||
pub fn set_waker(&mut self, waker: Waker) {
|
||||
// Ensure we never override an already existing waker.
|
||||
debug_assert!(self.waker.is_none());
|
||||
|
||||
self.waker = Some(waker);
|
||||
}
|
||||
|
||||
@@ -259,22 +257,29 @@ impl TryFrom<i32> for PtraceOperation {
|
||||
|
||||
pub async fn ptrace_stop(point: TracePoint) -> bool {
|
||||
let task_sh = current_task_shared();
|
||||
{
|
||||
let mut ptrace = task_sh.ptrace.lock_save_irq();
|
||||
|
||||
if ptrace.hit_trace_point(point, current_task().ctx.user()) {
|
||||
ptrace.notify_tracer_of_trap(&task_sh.process);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
let mut notified = false;
|
||||
|
||||
poll_fn(|cx| {
|
||||
let mut ptrace = task_sh.ptrace.lock_save_irq();
|
||||
|
||||
if matches!(ptrace.state, Some(PTraceState::Running)) {
|
||||
if !notified {
|
||||
// First poll: hit the trace point, set waker, then notify.
|
||||
// The waker must be set *before* notification so the tracer
|
||||
// can always find it when it does PTRACE_SYSCALL/CONT.
|
||||
if !ptrace.hit_trace_point(point, current_task().ctx.user()) {
|
||||
return Poll::Ready(false);
|
||||
}
|
||||
|
||||
notified = true;
|
||||
ptrace.set_waker(cx.waker().clone());
|
||||
ptrace.notify_tracer_of_trap(&task_sh.process);
|
||||
Poll::Pending
|
||||
} else if matches!(ptrace.state, Some(PTraceState::Running)) {
|
||||
// Tracer resumed us.
|
||||
Poll::Ready(true)
|
||||
} else {
|
||||
// Re-polled (e.g. spurious wakeup from signal) but tracer
|
||||
// hasn't resumed yet. Refresh the waker and go back to sleep.
|
||||
ptrace.set_waker(cx.waker().clone());
|
||||
Poll::Pending
|
||||
}
|
||||
@@ -381,7 +386,9 @@ pub async fn sys_ptrace(op: i32, pid: u64, addr: UA, data: UA) -> Result<usize>
|
||||
.break_points
|
||||
.remove(TracePoint::SyscallEntry | TracePoint::SyscallExit);
|
||||
|
||||
*target_task.state.lock_save_irq() = TaskState::Runnable;
|
||||
if let Some(waker) = ptrace.waker.take() {
|
||||
waker.wake();
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
use super::{Task, TaskState, Tid};
|
||||
use crate::{memory::uaccess::UserCopyable, sched::waker::create_waker, sync::SpinLock};
|
||||
use super::Tid;
|
||||
use crate::{
|
||||
memory::uaccess::UserCopyable,
|
||||
sched::{
|
||||
sched_task::{Work, state::TaskState},
|
||||
waker::create_waker,
|
||||
},
|
||||
sync::SpinLock,
|
||||
};
|
||||
use alloc::{
|
||||
collections::btree_map::BTreeMap,
|
||||
sync::{Arc, Weak},
|
||||
@@ -95,7 +102,7 @@ pub struct ThreadGroup {
|
||||
pub umask: SpinLock<u32>,
|
||||
pub parent: SpinLock<Option<Weak<ThreadGroup>>>,
|
||||
pub children: SpinLock<BTreeMap<Tgid, Arc<ThreadGroup>>>,
|
||||
pub tasks: SpinLock<BTreeMap<Tid, Weak<Task>>>,
|
||||
pub tasks: SpinLock<BTreeMap<Tid, Weak<Work>>>,
|
||||
pub signals: Arc<SpinLock<SignalActionState>>,
|
||||
pub rsrc_lim: Arc<SpinLock<ResourceLimits>>,
|
||||
pub pending_signals: SpinLock<SigSet>,
|
||||
@@ -165,13 +172,10 @@ impl ThreadGroup {
|
||||
*self.pending_signals.lock_save_irq() = SigSet::SIGKILL;
|
||||
|
||||
for task in self.tasks.lock_save_irq().values() {
|
||||
if let Some(task) = task.upgrade()
|
||||
&& matches!(
|
||||
*task.state.lock_save_irq(),
|
||||
TaskState::Stopped | TaskState::Sleeping
|
||||
)
|
||||
{
|
||||
create_waker(task.descriptor()).wake();
|
||||
if let Some(task) = task.upgrade() {
|
||||
// Wake will handle Sleeping/Stopped → Enqueue,
|
||||
// and Running/Pending* → PreventedSleep (sets Woken).
|
||||
create_waker(task).wake();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -182,8 +186,12 @@ impl ThreadGroup {
|
||||
for task in self.tasks.lock_save_irq().values() {
|
||||
if let Some(task) = task.upgrade()
|
||||
&& matches!(
|
||||
*task.state.lock_save_irq(),
|
||||
TaskState::Runnable | TaskState::Running
|
||||
task.state.load(Ordering::Acquire),
|
||||
TaskState::Runnable
|
||||
| TaskState::Running
|
||||
| TaskState::Woken
|
||||
| TaskState::PendingSleep
|
||||
| TaskState::PendingStop
|
||||
)
|
||||
{
|
||||
// Signal delivered. This task will eventually be
|
||||
@@ -196,7 +204,7 @@ impl ThreadGroup {
|
||||
// No task will pick up the signal. Wake one up.
|
||||
for task in self.tasks.lock_save_irq().values() {
|
||||
if let Some(task) = task.upgrade() {
|
||||
create_waker(task.descriptor()).wake();
|
||||
create_waker(task).wake();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user