ostd/task/scheduler/
mod.rs

// SPDX-License-Identifier: MPL-2.0

//! Task scheduling.
//!
//! # Scheduler Injection
//!
//! The task scheduler of an OS is a complex beast,
//! and the most suitable scheduling algorithm often depends on the target usage scenario.
//! To avoid code bloat and offer flexibility,
//! OSTD does not include a gigantic, one-size-fits-all task scheduler.
//! Instead, it allows the client to implement a custom scheduler (in safe Rust, of course)
//! and register it with OSTD.
//! This feature is known as **scheduler injection**.
//!
//! The client kernel performs scheduler injection via the [`inject_scheduler`] API.
//! This API should be called as early as possible during kernel initialization,
//! before any [`Task`]-related APIs are used.
//! This requirement is reasonable since `Task`s depend on the scheduler.
//!
//! # Scheduler Abstraction
//!
//! The `inject_scheduler` API accepts an object implementing the [`Scheduler`] trait,
//! which abstracts over any SMP-aware task scheduler.
//! Whenever an OSTD client spawns a new task (via [`crate::task::TaskOptions`])
//! or wakes a sleeping task (e.g., via [`crate::sync::Waker`]),
//! OSTD internally forwards the corresponding `Arc<Task>`
//! to the scheduler by invoking the [`Scheduler::enqueue`] method.
//! This allows the injected scheduler to manage all runnable tasks.
//!
//! Each enqueued task is dispatched to one of the per-CPU local runqueues,
//! which manage all runnable tasks on a specific CPU.
//! A local runqueue is abstracted by the [`LocalRunQueue`] trait.
//! OSTD accesses the local runqueue of the current CPU
//! via [`Scheduler::local_rq_with`] or [`Scheduler::mut_local_rq_with`],
//! which return immutable and mutable references to `dyn LocalRunQueue`, respectively.
//!
//! The [`LocalRunQueue`] trait enables OSTD to inspect and manipulate local runqueues.
//! For instance, OSTD invokes the [`LocalRunQueue::pick_next`] method
//! to let the scheduler select the next task to run.
//! OSTD then performs a context switch to that task,
//! which becomes the _current_ running task, accessible via [`LocalRunQueue::current`].
//! When the current task is about to sleep (e.g., via [`crate::sync::Waiter`]),
//! OSTD removes it from the local runqueue using [`LocalRunQueue::dequeue_current`].
//!
//! The interfaces of `Scheduler` and `LocalRunQueue` are simple
//! yet (perhaps surprisingly) powerful enough to support
//! even complex and advanced task scheduler implementations.
//! Scheduler implementations are free to employ any load-balancing strategy
//! to dispatch enqueued tasks across local runqueues,
//! and each local runqueue is free to choose any prioritization strategy
//! for selecting the next task to run.
//! Based on OSTD's scheduling abstractions,
//! the Asterinas kernel has successfully supported multiple Linux scheduling classes,
//! including both real-time and normal policies.
//!
//! # Safety Impact
//!
//! While OSTD delegates scheduling decisions to the injected task scheduler,
//! it verifies these decisions to avoid undefined behavior.
//! In particular, it enforces the following safety invariant:
//!
//! > A task must not be scheduled to run on more than one CPU at a time.
//!
//! Violating this invariant—e.g., running the same task on two CPUs concurrently—
//! can have catastrophic consequences,
//! as the task's stack and internal state may be corrupted by concurrent modifications.

68mod fifo_scheduler;
69pub mod info;
70
71use spin::Once;
72
73use super::{Task, preempt::cpu_local, processor};
74use crate::{
75    cpu::{CpuId, CpuSet, PinCurrentCpu},
76    prelude::*,
77    task::disable_preempt,
78    timer,
79};
80
81/// Injects a custom implementation of task scheduler into OSTD.
82///
83/// This function can only be called once and must be called
84/// during the initialization phase of kernel,
85/// before any [`Task`]-related APIs are invoked.
86///
87/// # Panics
88///
89/// This function panics if a scheduler has already been injected,
90/// either explicitly via a previous call to this function
91/// or implicitly via the lazy default FIFO scheduler.
92pub fn inject_scheduler(scheduler: &'static dyn Scheduler<Task>) {
93    let mut is_new = false;
94    SCHEDULER.call_once(|| {
95        is_new = true;
96        scheduler
97    });
98    assert!(is_new, "a scheduler has already been initialized");
99}
100
101/// Enables preemptive scheduling on the current CPU.
102///
103/// After calling this function on a CPU,
104/// a task that is executing in the user mode may get preempted
105/// if another runnable task on the same CPU is deemed more urgent by the scheduler.
106///
107/// OSTD achieves task preemption by registering a per-CPU timer callback
108/// to invoke the scheduler periodically.
109/// Thus, this function should be called _once_ on every CPU
110/// by an OSTD-based kernel during its initialization phase,
111/// after it has injected its scheduler via [`inject_scheduler`].
112pub fn enable_preemption_on_cpu() {
113    timer::register_callback_on_cpu(|| {
114        scheduler_singleton().mut_local_rq_with(&mut |local_rq| {
115            let should_pick_next = local_rq.update_current(UpdateFlags::Tick);
116            if should_pick_next {
117                cpu_local::set_need_preempt();
118            }
119        })
120    });
121}
122
/// The global scheduler singleton.
///
/// Do not access this directly; it may not yet be initialized.
/// Use [`scheduler_singleton`] instead, which returns the current scheduler
/// and falls back to a default FIFO scheduler if none has been injected.
///
/// Holds a `&'static` trait object so injected schedulers must live for the
/// whole kernel lifetime; `spin::Once` makes the one-time write thread-safe.
static SCHEDULER: Once<&'static dyn Scheduler<Task>> = Once::new();
129
130/// Returns the global scheduler.
131///
132/// If a scheduler has already been injected (e.g., by a custom scheduler),
133/// it is returned directly. Otherwise, a default FIFO scheduler
134/// is lazily initialized and used.
135///
136/// Initialization is atomic: [`Once::call_once`] guarantees that even if
137/// multiple CPUs call this concurrently, only one will execute the
138/// initialization closure while the others block until it completes.
139fn scheduler_singleton() -> &'static dyn Scheduler<Task> {
140    *SCHEDULER.call_once(|| fifo_scheduler::new_instance())
141}
142
/// A SMP-aware task scheduler.
///
/// Injected via [`inject_scheduler`]. OSTD calls [`Scheduler::enqueue`] whenever
/// a task becomes runnable, and reaches the per-CPU runqueues through the
/// `*_local_rq_with` accessors.
pub trait Scheduler<T = Task>: Sync + Send {
    /// Enqueues a runnable task.
    ///
    /// The scheduler implementer can perform load-balancing or some time accounting work here.
    ///
    /// The newly-enqueued task may have a higher priority than the currently running one on a CPU
    /// and thus should preempt the latter.
    /// In this case, this method returns the ID of that CPU.
    fn enqueue(&self, runnable: Arc<T>, flags: EnqueueFlags) -> Option<CpuId>;

    /// Gets an immutable access to the local runqueue of the current CPU.
    ///
    /// The closure-based interface (instead of returning a reference) lets the
    /// implementation scope any internal locking to the duration of the call.
    fn local_rq_with(&self, f: &mut dyn FnMut(&dyn LocalRunQueue<T>));

    /// Gets a mutable access to the local runqueue of the current CPU.
    ///
    /// See [`Scheduler::local_rq_with`] for why this takes a closure.
    fn mut_local_rq_with(&self, f: &mut dyn FnMut(&mut dyn LocalRunQueue<T>));
}
160
/// A per-CPU, local runqueue.
///
/// This abstraction allows OSTD to inspect and manipulate local runqueues.
///
/// Conceptually, a local runqueue maintains:
/// 1. A priority queue of runnable tasks.
///    The definition of "priority" is left to the concrete implementation.
/// 2. The current running task.
///
/// # Interactions with OSTD
///
/// ## Overview
///
/// It is crucial for implementers of `LocalRunQueue`
/// to understand how OSTD interacts with local runqueues.
///
/// A local runqueue is consulted by OSTD in response to one of four scheduling events:
/// - **Yielding**, triggered by [`Task::yield_now`], where the current task voluntarily gives up CPU time.
/// - **Sleeping**, triggered by [`crate::sync::Waiter::wait`]
///   or any synchronization primitive built upon it (e.g., [`crate::sync::WaitQueue`], [`crate::sync::Mutex`]),
///   which blocks the current task until a wake-up event occurs.
/// - **Ticking**, triggered periodically by the system timer
///   (see [`crate::timer::TIMER_FREQ`]),
///   which provides an opportunity to do time accounting and consider preemption.
/// - **Exiting**, triggered when the execution logic of a task has come to an end,
///   which informs the scheduler that the task is exiting and will never be enqueued again.
///
/// The general workflow for OSTD to handle a scheduling event is as follows:
/// 1. Acquire exclusive access to the local runqueue using [`Scheduler::mut_local_rq_with`].
/// 2. Call [`LocalRunQueue::update_current`] to update the current task's state,
///    returning a boolean value that indicates
///    whether the current task should and can be replaced with another runnable task.
/// 3. If the task is about to sleep or exit, call [`LocalRunQueue::dequeue_current`]
///    to remove it from the runqueue.
/// 4. If the return value of `update_current` in Step 2 is true,
///    then select the next task to run with [`LocalRunQueue::pick_next`].
///
/// ## When to Pick the Next Task?
///
/// As shown above,
/// OSTD guarantees that `pick_next` is only called
/// when the current task should and can be replaced.
/// This avoids unnecessary invocations and improves efficiency.
///
/// But under what conditions should the current task be replaced?
/// Two criteria must be met:
/// 1. There exists at least one other runnable task in the runqueue.
/// 2. That task should preempt the current one, if present.
///
/// Some implications of these rules:
/// - If the runqueue is empty, `update_current` must return `false`—there's nothing to run.
/// - If the runqueue is non-empty but the current task is absent,
///   `update_current` should return `true`—anything is better than nothing.
/// - If the runqueue is non-empty and the flag is [`UpdateFlags::Wait`],
///   `update_current` should also return `true`,
///   because the current task is about to block.
/// - In other cases, the return value depends on the scheduler's prioritization policy.
///   For instance, a real-time task may only be preempted by a higher-priority task
///   or if it explicitly yields.
///   A normal task under Linux's CFS may be preempted by a task with smaller vruntime,
///   but never by the idle task.
///
/// When OSTD is unsure about whether the current task should or can be replaced,
/// it will invoke [`LocalRunQueue::try_pick_next`], the fallible version of `pick_next`.
///
/// ## Internal Working
///
/// To guide scheduler implementers,
/// we provide a simplified view of how OSTD interacts with local runqueues _internally_
/// in order to handle the four scheduling events.
///
/// Note that the snippets below are illustrative pseudocode rather than
/// compilable doctests: for brevity, they pretend that the `*_local_rq_with`
/// methods return the closure's result.
///
/// ### Yielding
///
/// ```ignore
/// # use ostd::prelude::*;
/// # use ostd::task::{*, scheduler::*};
/// #
/// # fn switch_to(next: Arc<Task>) {}
/// #
/// /// Yields the current task.
/// fn yield_now(scheduler: &'static dyn Scheduler) {
///     let next_task_opt: Option<Arc<Task>> = scheduler.mut_local_rq_with(|local_rq| {
///         let should_pick_next = local_rq.update_current(UpdateFlags::Yield);
///         should_pick_next.then(|| local_rq.pick_next().clone())
///     });
///     if let Some(next_task) = next_task_opt {
///         switch_to(next_task);
///     }
/// }
/// ```
///
/// ### Sleeping
///
/// ```ignore
/// # use ostd::prelude::*;
/// # use ostd::task::{*, scheduler::*};
/// #
/// # fn switch_to(next: Arc<Task>) {}
/// #
/// /// Puts the current task to sleep.
/// ///
/// /// The function takes a closure to check if the task is woken.
/// /// This function is used internally to guard against race conditions,
/// /// where the task is woken just before it goes to sleep.
/// fn sleep<F: Fn() -> bool>(scheduler: &'static dyn Scheduler, is_woken: F) {
///     let mut next_task_opt: Option<Arc<Task>> = None;
///     let mut is_first_try = true;
///     while scheduler.mut_local_rq_with(|local_rq| {
///         if is_first_try {
///             if is_woken() {
///                 return false; // exit loop
///             }
///             is_first_try = false;
///
///             let should_pick_next = local_rq.update_current(UpdateFlags::Wait);
///             let _current = local_rq.dequeue_current();
///             if !should_pick_next {
///                 return true; // continue loop
///             }
///             next_task_opt = Some(local_rq.pick_next().clone());
///             false // exit loop
///         } else {
///             next_task_opt = local_rq.try_pick_next().cloned();
///             next_task_opt.is_none()
///         }
///     }) {}
///     if let Some(next_task) = next_task_opt {
///         switch_to(next_task);
///     }
/// }
/// ```
///
/// ### Ticking
///
/// ```ignore
/// # use ostd::prelude::*;
/// # use ostd::task::{*, scheduler::*};
/// #
/// # fn switch_to(next: Arc<Task>) {}
/// # mod cpu_local {
/// #     fn set_need_preempt();
/// #     fn should_preempt() -> bool;
/// # }
/// #
/// /// A callback to be invoked periodically by the timer interrupt.
/// fn on_tick(scheduler: &'static dyn Scheduler) {
///     scheduler.mut_local_rq_with(|local_rq| {
///         let should_pick_next = local_rq.update_current(UpdateFlags::Tick);
///         if should_pick_next {
///             cpu_local::set_need_preempt();
///         }
///     });
/// }
///
/// /// A preemption point, called at an earliest convenient timing
/// /// when OSTD can safely preempt the current running task.
/// fn might_preempt(scheduler: &'static dyn Scheduler) {
///     if !cpu_local::should_preempt() {
///         return;
///     }
///     let next_task_opt: Option<Arc<Task>> = scheduler
///         .mut_local_rq_with(|local_rq| local_rq.try_pick_next().cloned());
///     if let Some(next_task) = next_task_opt {
///         switch_to(next_task);
///     }
/// }
/// ```
///
/// ### Exiting
///
/// ```ignore
/// # use ostd::prelude::*;
/// # use ostd::task::{*, scheduler::*};
/// #
/// # fn switch_to(next: Arc<Task>) {}
/// #
/// /// Exits the current task.
/// fn exit(scheduler: &'static dyn Scheduler) {
///     let mut next_task_opt: Option<Arc<Task>> = None;
///     let mut is_first_try = true;
///     while scheduler.mut_local_rq_with(|local_rq| {
///         if is_first_try {
///             is_first_try = false;
///             let should_pick_next = local_rq.update_current(UpdateFlags::Exit);
///             let _current = local_rq.dequeue_current();
///             if !should_pick_next {
///                 return true; // continue loop
///             }
///             next_task_opt = Some(local_rq.pick_next().clone());
///             false // exit loop
///         } else {
///             next_task_opt = local_rq.try_pick_next().cloned();
///             next_task_opt.is_none()
///         }
///     }) {}
///     let next_task = next_task_opt.unwrap();
///     switch_to(next_task);
/// }
/// ```
pub trait LocalRunQueue<T = Task> {
    /// Gets the current runnable task.
    fn current(&self) -> Option<&Arc<T>>;

    /// Updates the current runnable task's scheduling statistics and
    /// potentially its position in the runqueue.
    ///
    /// The return value of this method indicates whether an invocation of `pick_next` should be followed
    /// to find another task to replace the current one.
    #[must_use]
    fn update_current(&mut self, flags: UpdateFlags) -> bool;

    /// Picks the next runnable task.
    ///
    /// This method instructs the local runqueue to pick the next runnable task and replace the current one.
    /// A reference to the new "current" task will be returned by this method.
    /// If the "old" current task is present, then it is still runnable and thus remains in the runqueue.
    ///
    /// # Panics
    ///
    /// As explained in the type-level Rust doc,
    /// this method will only be invoked by OSTD after a call to `update_current` returns true.
    /// In case that this contract is broken by the caller,
    /// the implementer is free to exhibit any undesirable or incorrect behaviors, including panicking.
    fn pick_next(&mut self) -> &Arc<T> {
        self.try_pick_next().unwrap()
    }

    /// Tries to pick the next runnable task.
    ///
    /// This method instructs the local runqueue to pick the next runnable task on a best-effort basis.
    /// If such a task can be picked, then it supersedes the current task and
    /// the method returns a reference to the new "current" task.
    /// If the "old" current task is present, then it is still runnable and thus remains in the runqueue.
    fn try_pick_next(&mut self) -> Option<&Arc<T>>;

    /// Removes the current runnable task from runqueue.
    ///
    /// This method returns the current runnable task.
    /// If there is no current runnable task, this method returns `None`.
    fn dequeue_current(&mut self) -> Option<Arc<T>>;
}
402
/// Possible triggers of an `enqueue` action.
///
/// Passed to [`Scheduler::enqueue`] so the implementation can distinguish
/// a brand-new task from a task being woken after a sleep.
#[derive(PartialEq, Copy, Clone)]
pub enum EnqueueFlags {
    /// Spawn a new task.
    Spawn,
    /// Wake a sleeping task.
    Wake,
}
411
/// Possible triggers of an `update_current` action.
///
/// Tells [`LocalRunQueue::update_current`] which scheduling event is being
/// handled, so the implementation can account time and decide on preemption.
#[derive(PartialEq, Copy, Clone)]
pub enum UpdateFlags {
    /// Timer interrupt.
    Tick,
    /// Task waiting.
    Wait,
    /// Task yielding.
    Yield,
    /// Task exiting.
    Exit,
}
424
425/// Preempts the current task.
426#[track_caller]
427pub(crate) fn might_preempt() {
428    if !cpu_local::should_preempt() {
429        return;
430    }
431    reschedule(|local_rq| {
432        let next_task_opt = local_rq.try_pick_next();
433        if let Some(next_task) = next_task_opt {
434            ReschedAction::SwitchTo(next_task.clone())
435        } else {
436            ReschedAction::DoNothing
437        }
438    })
439}
440
/// Blocks the current task unless `has_unparked()` returns `true`.
///
/// Note that this method may return due to spurious wake events. It's the caller's responsibility
/// to detect them (if necessary).
#[track_caller]
pub(crate) fn park_current<F>(has_unparked: F)
where
    F: Fn() -> bool,
{
    // Caches the task dequeued on the first try, so later iterations can tell
    // whether the task picked next is the parking task itself (i.e., it was
    // woken and re-enqueued in the meantime).
    let mut current = None;
    let mut is_first_try = true;

    reschedule(|local_rq: &mut dyn LocalRunQueue| {
        let next_task_opt = if is_first_try {
            // Last-minute check while holding the runqueue: if a wakeup already
            // happened, don't go to sleep at all.
            if has_unparked() {
                return ReschedAction::DoNothing;
            }
            is_first_try = false;

            // Note the race conditions: the current task may be woken after the above `has_unparked`
            // check, but before the below `dequeue_current` action, we need to make sure that the
            // wakeup event isn't lost.
            //
            // Currently, for the FIFO and CFS scheduler, `Scheduler::enqueue` will try to lock `local_rq`
            // when the above race condition occurs, so it will wait until we finish calling the
            // `dequeue_current` method and nothing bad will happen. This may need to be revisited
            // after more complex schedulers are introduced.

            let should_pick_next = local_rq.update_current(UpdateFlags::Wait);
            current = local_rq.dequeue_current();
            should_pick_next.then(|| local_rq.pick_next())
        } else {
            // A previous iteration found nothing runnable; keep polling.
            local_rq.try_pick_next()
        };

        if let Some(next_task) = next_task_opt {
            // NOTE(review): the unwrap relies on `dequeue_current` having returned
            // `Some` on the first try — i.e., the parking task was the runqueue's
            // current task at that point. Confirm this invariant holds for all
            // scheduler implementations.
            if Arc::ptr_eq(current.as_ref().unwrap(), next_task) {
                // The current task has been woken and picked as the next runnable task.
                return ReschedAction::DoNothing;
            }
            return ReschedAction::SwitchTo(next_task.clone());
        }

        // Nothing to run yet; retry until some task (possibly this one, via a
        // wakeup) becomes runnable.
        ReschedAction::Retry
    });
}
487
488/// Unblocks a target task.
489pub(crate) fn unpark_target(runnable: Arc<Task>) {
490    let preempt_cpu = scheduler_singleton().enqueue(runnable, EnqueueFlags::Wake);
491    if let Some(preempt_cpu_id) = preempt_cpu {
492        set_need_preempt(preempt_cpu_id);
493    }
494}
495
496/// Enqueues a newly built task.
497///
498/// Note that the new task is not guaranteed to run at once.
499#[track_caller]
500pub(super) fn run_new_task(runnable: Arc<Task>) {
501    let preempt_cpu = scheduler_singleton().enqueue(runnable, EnqueueFlags::Spawn);
502    if let Some(preempt_cpu_id) = preempt_cpu {
503        set_need_preempt(preempt_cpu_id);
504    }
505
506    might_preempt();
507}
508
509fn set_need_preempt(cpu_id: CpuId) {
510    let preempt_guard = disable_preempt();
511
512    if preempt_guard.current_cpu() == cpu_id {
513        cpu_local::set_need_preempt();
514    } else {
515        crate::smp::inter_processor_call(&CpuSet::from(cpu_id), || {
516            cpu_local::set_need_preempt();
517        });
518    }
519}
520
521/// Dequeues the current task from its runqueue.
522///
523/// This should only be called if the current is to exit.
524#[track_caller]
525pub(super) fn exit_current() -> ! {
526    let mut is_first_try = true;
527
528    reschedule(|local_rq: &mut dyn LocalRunQueue| {
529        let next_task_opt = if is_first_try {
530            is_first_try = false;
531            let should_pick_next = local_rq.update_current(UpdateFlags::Exit);
532            let _current = local_rq.dequeue_current();
533            should_pick_next.then(|| local_rq.pick_next())
534        } else {
535            local_rq.try_pick_next()
536        };
537
538        if let Some(next_task) = next_task_opt {
539            ReschedAction::SwitchTo(next_task.clone())
540        } else {
541            ReschedAction::Retry
542        }
543    });
544
545    unreachable!()
546}
547
548/// Yields execution.
549#[track_caller]
550pub(super) fn yield_now() {
551    reschedule(|local_rq| {
552        let should_pick_next = local_rq.update_current(UpdateFlags::Yield);
553        let next_task_opt = should_pick_next.then(|| local_rq.pick_next());
554        if let Some(next_task) = next_task_opt {
555            ReschedAction::SwitchTo(next_task.clone())
556        } else {
557            ReschedAction::DoNothing
558        }
559    })
560}
561
/// Do rescheduling by acting on the scheduling decision (`ReschedAction`) made by a
/// user-given closure.
///
/// The closure makes the scheduling decision by taking the local runqueue as its input.
/// It may be invoked multiple times if it keeps returning [`ReschedAction::Retry`].
#[track_caller]
fn reschedule<F>(mut f: F)
where
    F: FnMut(&mut dyn LocalRunQueue) -> ReschedAction,
{
    // Even if the decision below is `DoNothing`, we should clear this flag. Meanwhile, to avoid
    // race conditions, we should do this before making the decision.
    cpu_local::clear_need_preempt();

    let next_task = loop {
        // `mut_local_rq_with` takes a `&mut dyn FnMut` and returns nothing,
        // so the closure's decision is carried out through this local.
        let mut action = ReschedAction::DoNothing;
        scheduler_singleton().mut_local_rq_with(&mut |rq| {
            action = f(rq);
        });

        match action {
            ReschedAction::DoNothing => {
                return;
            }
            ReschedAction::Retry => {
                continue;
            }
            ReschedAction::SwitchTo(next_task) => {
                break next_task;
            }
        };
    };

    // `switch_to_task` will spin if it finds that the next task is still running on some CPU core,
    // which guarantees soundness regardless of the scheduler implementation.
    //
    // FIXME: The scheduler decision and context switching are not atomic, which can lead to some
    // strange behavior even if the scheduler is implemented correctly. See "Problem 2" at
    // <https://github.com/asterinas/asterinas/issues/1633> for details.
    processor::switch_to_task(next_task);
}
602
/// Possible actions of a rescheduling.
///
/// Returned by the decision closure passed to [`reschedule`].
enum ReschedAction {
    /// Keep running current task and do nothing.
    DoNothing,
    /// Loop until finding a task to swap out the current.
    Retry,
    /// Switch to target task.
    SwitchTo(Arc<Task>),
}