ostd/task/scheduler/
mod.rs

1// SPDX-License-Identifier: MPL-2.0
2
3//! Task scheduling.
4//!
5//! # Scheduler Injection
6//!
7//! The task scheduler of an OS is a complex beast,
8//! and the most suitable scheduling algorithm often depends on the target usage scenario.
9//! To avoid code bloat and offer flexibility,
10//! OSTD does not include a gigantic, one-size-fits-all task scheduler.
11//! Instead, it allows the client to implement a custom scheduler (in safe Rust, of course)
12//! and register it with OSTD.
13//! This feature is known as **scheduler injection**.
14//!
15//! The client kernel performs scheduler injection via the [`inject_scheduler`] API.
16//! This API should be called as early as possible during kernel initialization,
17//! before any [`Task`]-related APIs are used.
18//! This requirement is reasonable since `Task`s depend on the scheduler.
19//!
20//! # Scheduler Abstraction
21//!
22//! The `inject_scheduler` API accepts an object implementing the [`Scheduler`] trait,
23//! which abstracts over any SMP-aware task scheduler.
24//! Whenever an OSTD client spawns a new task (via [`crate::task::TaskOptions`])
25//! or wakes a sleeping task (e.g., via [`crate::sync::Waker`]),
26//! OSTD internally forwards the corresponding `Arc<Task>`
27//! to the scheduler by invoking the [`Scheduler::enqueue`] method.
28//! This allows the injected scheduler to manage all runnable tasks.
29//!
30//! Each enqueued task is dispatched to one of the per-CPU local runqueues,
31//! which manage all runnable tasks on a specific CPU.
32//! A local runqueue is abstracted by the [`LocalRunQueue`] trait.
33//! OSTD accesses the local runqueue of the current CPU
34//! via [`Scheduler::local_rq_with`] or [`Scheduler::mut_local_rq_with`],
35//! which return immutable and mutable references to `dyn LocalRunQueue`, respectively.
36//!
37//! The [`LocalRunQueue`] trait enables OSTD to inspect and manipulate local runqueues.
38//! For instance, OSTD invokes the [`LocalRunQueue::pick_next`] method
39//! to let the scheduler select the next task to run.
40//! OSTD then performs a context switch to that task,
41//! which becomes the _current_ running task, accessible via [`LocalRunQueue::current`].
42//! When the current task is about to sleep (e.g., via [`crate::sync::Waiter`]),
43//! OSTD removes it from the local runqueue using [`LocalRunQueue::dequeue_current`].
44//!
45//! The interfaces of `Scheduler` and `LocalRunQueue` are simple
46//! yet (perhaps surprisingly) powerful enough to support
47//! even complex and advanced task scheduler implementations.
48//! Scheduler implementations are free to employ any load-balancing strategy
49//! to dispatch enqueued tasks across local runqueues,
50//! and each local runqueue is free to choose any prioritization strategy
51//! for selecting the next task to run.
52//! Based on OSTD's scheduling abstractions,
53//! the Asterinas kernel has successfully supported multiple Linux scheduling classes,
54//! including both real-time and normal policies.
55//!
56//! # Safety Impact
57//!
58//! While OSTD delegates scheduling decisions to the injected task scheduler,
59//! it verifies these decisions to avoid undefined behavior.
60//! In particular, it enforces the following safety invariant:
61//!
62//! > A task must not be scheduled to run on more than one CPU at a time.
63//!
64//! Violating this invariant—e.g., running the same task on two CPUs concurrently—
65//! can have catastrophic consequences,
66//! as the task's stack and internal state may be corrupted by concurrent modifications.
67
68mod fifo_scheduler;
69pub mod info;
70
71use spin::Once;
72
73use super::{Task, preempt::cpu_local, processor};
74use crate::{
75    cpu::{CpuId, CpuSet, PinCurrentCpu},
76    prelude::*,
77    task::disable_preempt,
78    timer,
79};
80
81/// Injects a custom implementation of task scheduler into OSTD.
82///
83/// This function can only be called once and must be called during the initialization phase of kernel,
84/// before any [`Task`]-related APIs are invoked.
85pub fn inject_scheduler(scheduler: &'static dyn Scheduler<Task>) {
86    SCHEDULER.call_once(|| scheduler);
87}
88
89/// Enables preemptive scheduling on the current CPU.
90///
91/// After calling this function on a CPU,
92/// a task that is executing in the user mode may get preempted
93/// if another runnable task on the same CPU is deemed more urgent by the scheduler.
94///
95/// OSTD achieves task preemption by registering a per-CPU timer callback
96/// to invoke the scheduler periodically.
97/// Thus, this function should be called _once_ on every CPU
98/// by an OSTD-based kernel during its initialization phase,
99/// after it has injected its scheduler via [`inject_scheduler`].
100pub fn enable_preemption_on_cpu() {
101    timer::register_callback_on_cpu(|| {
102        SCHEDULER.get().unwrap().mut_local_rq_with(&mut |local_rq| {
103            let should_pick_next = local_rq.update_current(UpdateFlags::Tick);
104            if should_pick_next {
105                cpu_local::set_need_preempt();
106            }
107        })
108    });
109}
110
111static SCHEDULER: Once<&'static dyn Scheduler<Task>> = Once::new();
112
113/// A SMP-aware task scheduler.
114pub trait Scheduler<T = Task>: Sync + Send {
115    /// Enqueues a runnable task.
116    ///
117    /// The scheduler implementer can perform load-balancing or some time accounting work here.
118    ///
119    /// The newly-enqueued task may have a higher priority than the currently running one on a CPU
120    /// and thus should preempt the latter.
121    /// In this case, this method returns the ID of that CPU.
122    fn enqueue(&self, runnable: Arc<T>, flags: EnqueueFlags) -> Option<CpuId>;
123
124    /// Gets an immutable access to the local runqueue of the current CPU.
125    fn local_rq_with(&self, f: &mut dyn FnMut(&dyn LocalRunQueue<T>));
126
127    /// Gets a mutable access to the local runqueue of the current CPU.
128    fn mut_local_rq_with(&self, f: &mut dyn FnMut(&mut dyn LocalRunQueue<T>));
129}
130
131/// A per-CPU, local runqueue.
132///
133/// This abstraction allows OSTD to inspect and manipulate local runqueues.
134///
135/// Conceptually, a local runqueue maintains:
136/// 1. A priority queue of runnable tasks.
137///    The definition of "priority" is left to the concrete implementation.
138/// 2. The current running task.
139///
140/// # Interactions with OSTD
141///
142/// ## Overview
143///
144/// It is crucial for implementers of `LocalRunQueue`
145/// to understand how OSTD interacts with local runqueues.
146///
147/// A local runqueue is consulted by OSTD in response to one of four scheduling events:
148/// - **Yielding**, triggered by [`Task::yield_now`], where the current task voluntarily gives up CPU time.
149/// - **Sleeping**, triggered by [`crate::sync::Waiter::wait`]
150///   or any synchronization primitive built upon it (e.g., [`crate::sync::WaitQueue`], [`crate::sync::Mutex`]),
151///   which blocks the current task until a wake-up event occurs.
152/// - **Ticking**, triggered periodically by the system timer
153///   (see [`crate::timer::TIMER_FREQ`]),
154///   which provides an opportunity to do time accounting and consider preemption.
155/// - **Exiting**, triggered when the execution logic of a task has come to an end,
156///   which informs the scheduler that the task is exiting and will never be enqueued again.
157///
158/// The general workflow for OSTD to handle a scheduling event is as follows:
159/// 1. Acquire exclusive access to the local runqueue using [`Scheduler::mut_local_rq_with`].
160/// 2. Call [`LocalRunQueue::update_current`] to update the current task's state,
161///    returning a boolean value that indicates
162///    whether the current task should and can be replaced with another runnable task.
163/// 3. If the task is about to sleep or exit, call [`LocalRunQueue::dequeue_current`]
164///    to remove it from the runqueue.
165/// 4. If the return value of `update_current` in Step 2 is true,
166///    then select the next task to run with [`LocalRunQueue::pick_next`].
167///
168/// ## When to Pick the Next Task?
169///
170/// As shown above,
171/// OSTD guarantees that `pick_next` is only called
172/// when the current task should and can be replaced.
173/// This avoids unnecessary invocations and improves efficiency.
174///
175/// But under what conditions should the current task be replaced?
176/// Two criteria must be met:
177/// 1. There exists at least one other runnable task in the runqueue.
178/// 2. That task should preempt the current one, if present.
179///
180/// Some implications of these rules:
181/// - If the runqueue is empty, `update_current` must return `false`—there's nothing to run.
182/// - If the runqueue is non-empty but the current task is absent,
183///   `update_current` should return `true`—anything is better than nothing.
184/// - If the runqueue is non-empty and the flag is `UpdateFlags::WAIT`,
185///   `update_current` should also return `true`,
186///   because the current task is about to block.
187/// - In other cases, the return value depends on the scheduler's prioritization policy.
188///   For instance, a real-time task may only be preempted by a higher-priority task
189///   or if it explicitly yields.
190///   A normal task under Linux's CFS may be preempted by a task with smaller vruntime,
191///   but never by the idle task.
192///
193/// When OSTD is unsure about whether the current task should or can be replaced,
194/// it will invoke [`LocalRunQueue::try_pick_next`], the fallible version of `pick_next`.
195///
196/// ## Internal Working
197///
198/// To guide scheduler implementers,
199/// we provide a simplified view of how OSTD interacts with local runqueues _internally_
200/// in order to handle the four scheduling events.
201///
202/// ### Yielding
203///
204/// ```
205/// # use ostd::prelude::*;
206/// # use ostd::task::{*, scheduler::*};
207/// #
208/// # fn switch_to(next: Arc<Task>) {}
209/// #
210/// /// Yields the current task.
211/// fn yield(scheduler: &'static dyn Scheduler) {
212///     let next_task_opt: Option<Arc<Task>> = scheduler.mut_local_rq_with(|local_rq| {
213///         let should_pick_next = local_rq.update_current(UpdateFlags::Yield);
214///         should_pick_next.then(|| local_rq.pick_next().clone())
215///     });
216///     let Some(next_task) = next_task_opt {
217///         switch_to(next_task);
218///     }
219/// }
220/// ```
221///
222/// ### Sleeping
223///
224/// ```
225/// # use ostd::prelude::*;
226/// # use ostd::task::{*, scheduler::*};
227/// #
228/// # fn switch_to(next: Arc<Task>) {}
229/// #
230/// /// Puts the current task to sleep.
231/// ///
232/// /// The function takes a closure to check if the task is woken.
233/// /// This function is used internally to guard against race conditions,
234/// /// where the task is woken just before it goes to sleep.
235/// fn sleep<F: Fn() -> bool>(scheduler: &'static dyn Scheduler, is_woken: F) {
236///     let mut next_task_opt: Option<Arc<Task>> = None;
237///     let mut is_first_try = true;
238///     while scheduler.mut_local_rq_with(|local_rq| {
239///         if is_first_try {
240///             if is_woken() {
241///                 return false; // exit loop
242///             }
243///             is_first_try = false;
244///
245///             let should_pick_next = local_rq.update_current(UpdateFlags::Wait);
246///             let _current = local_rq.dequeue_current();
247///             if !should_pick_next {
248///                 return true; // continue loop
249///             }
250///             next_task_opt = Some(local_rq.pick_next().clone());
251///             false // exit loop
252///         } else {
253///             next_task_opt = local_rq.try_pick_next().cloned();
254///             next_task_opt.is_none()
255///         }
256///     }) {}
257///     let Some(next_task) = next_task_opt {
258///         switch_to(next_task);
259///     }
260/// }
261/// ```
262///
263/// ### Ticking
264///
265/// ```
266/// # use ostd::prelude::*;
267/// # use ostd::task::{*, scheduler::*};
268/// #
269/// # fn switch_to(next: Arc<Task>) {}
270/// # mod cpu_local {
271/// #     fn set_need_preempt();
272/// #     fn should_preempt() -> bool;
273/// # }
274/// #
275/// /// A callback to be invoked periodically by the timer interrupt.
276/// fn on_tick(scheduler: &'static dyn Scheduler) {
277///     scheduler.mut_local_rq_with(|local_rq| {
278///         let should_pick_next = local_rq.update_current(UpdateFlags::Tick);
279///         if should_pick_next {
280///             cpu_local::set_need_preempt();
281///         }
282///     });
283/// }
284///
285/// /// A preemption point, called at an earliest convenient timing
286/// /// when OSTD can safely preempt the current running task.
287/// fn might_preempt(scheduler: &'static dyn Scheduler) {
288///     if !cpu_local::should_preempt() {
289///         return;
290///     }
291///     let next_task_opt: Option<Arc<Task>> = scheduler
292///         .mut_local_rq_with(|local_rq| local_rq.try_pick_next().cloned())
293///     let Some(next_task) = next_task_opt {
294///         switch_to(next_task);
295///     }
296/// }
297/// ```
298///
299/// ### Exiting
300///
301/// ```
302/// # use ostd::prelude::*;
303/// # use ostd::task::{*, scheduler::*};
304/// #
305/// # fn switch_to(next: Arc<Task>) {}
306/// #
307/// /// Exits the current task.
308/// fn exit(scheduler: &'static dyn Scheduler) {
309///     let mut next_task_opt: Option<Arc<Task>> = None;
310///     let mut is_first_try = true;
311///     while scheduler.mut_local_rq_with(|local_rq| {
312///         if is_first_try {
313///             is_first_try = false;
314///             let should_pick_next = local_rq.update_current(UpdateFlags::Exit);
315///             let _current = local_rq.dequeue_current();
316///             if !should_pick_next {
317///                 return true; // continue loop
318///             }
319///             next_task_opt = Some(local_rq.pick_next().clone());
320///             false // exit loop
321///         } else {
322///             next_task_opt = local_rq.try_pick_next().cloned();
323///             next_task_opt.is_none()
324///         }
325///     }) {}
326///     let next_task = next_task_opt.unwrap();
327///     switch_to(next_task);
328/// }
329/// ```
330pub trait LocalRunQueue<T = Task> {
331    /// Gets the current runnable task.
332    fn current(&self) -> Option<&Arc<T>>;
333
334    /// Updates the current runnable task's scheduling statistics and
335    /// potentially its position in the runqueue.
336    ///
337    /// The return value of this method indicates whether an invocation of `pick_next` should be followed
338    /// to find another task to replace the current one.
339    #[must_use]
340    fn update_current(&mut self, flags: UpdateFlags) -> bool;
341
342    /// Picks the next runnable task.
343    ///
344    /// This method instructs the local runqueue to pick the next runnable task and replace the current one.
345    /// A reference to the new "current" task will be returned by this method.
346    /// If the "old" current task presents, then it is still runnable and thus remains in the runqueue.
347    ///
348    /// # Panics
349    ///
350    /// As explained in the type-level Rust doc,
351    /// this method will only be invoked by OSTD after a call to `update_current` returns true.
352    /// In case that this contract is broken by the caller,
353    /// the implementer is free to exhibit any undesirable or incorrect behaviors, include panicking.
354    fn pick_next(&mut self) -> &Arc<T> {
355        self.try_pick_next().unwrap()
356    }
357
358    /// Tries to pick the next runnable task.
359    ///
360    /// This method instructs the local runqueue to pick the next runnable task on a best-effort basis.
361    /// If such a task can be picked, then this task supersedes the current task and
362    /// the new the method returns a reference to the new "current" task.
363    /// If the "old" current task presents, then it is still runnable and thus remains in the runqueue.
364    fn try_pick_next(&mut self) -> Option<&Arc<T>>;
365
366    /// Removes the current runnable task from runqueue.
367    ///
368    /// This method returns the current runnable task.
369    /// If there is no current runnable task, this method returns `None`.
370    fn dequeue_current(&mut self) -> Option<Arc<T>>;
371}
372
373/// Possible triggers of an `enqueue` action.
374#[derive(PartialEq, Copy, Clone)]
375pub enum EnqueueFlags {
376    /// Spawn a new task.
377    Spawn,
378    /// Wake a sleeping task.
379    Wake,
380}
381
382/// Possible triggers of an `update_current` action.
383#[derive(PartialEq, Copy, Clone)]
384pub enum UpdateFlags {
385    /// Timer interrupt.
386    Tick,
387    /// Task waiting.
388    Wait,
389    /// Task yielding.
390    Yield,
391    /// Task exiting.
392    Exit,
393}
394
395/// Preempts the current task.
396#[track_caller]
397pub(crate) fn might_preempt() {
398    if !cpu_local::should_preempt() {
399        return;
400    }
401    reschedule(|local_rq| {
402        let next_task_opt = local_rq.try_pick_next();
403        if let Some(next_task) = next_task_opt {
404            ReschedAction::SwitchTo(next_task.clone())
405        } else {
406            ReschedAction::DoNothing
407        }
408    })
409}
410
411/// Blocks the current task unless `has_unparked()` returns `true`.
412///
413/// Note that this method may return due to spurious wake events. It's the caller's responsibility
414/// to detect them (if necessary).
415#[track_caller]
416pub(crate) fn park_current<F>(has_unparked: F)
417where
418    F: Fn() -> bool,
419{
420    let mut current = None;
421    let mut is_first_try = true;
422
423    reschedule(|local_rq: &mut dyn LocalRunQueue| {
424        let next_task_opt = if is_first_try {
425            if has_unparked() {
426                return ReschedAction::DoNothing;
427            }
428            is_first_try = false;
429
430            // Note the race conditions: the current task may be woken after the above `has_unparked`
431            // check, but before the below `dequeue_current` action, we need to make sure that the
432            // wakeup event isn't lost.
433            //
434            // Currently, for the FIFO and CFS scheduler, `Scheduler::enqueue` will try to lock `local_rq`
435            // when the above race condition occurs, so it will wait until we finish calling the
436            // `dequeue_current` method and nothing bad will happen. This may need to be revisited
437            // after more complex schedulers are introduced.
438
439            let should_pick_next = local_rq.update_current(UpdateFlags::Wait);
440            current = local_rq.dequeue_current();
441            should_pick_next.then(|| local_rq.pick_next())
442        } else {
443            local_rq.try_pick_next()
444        };
445
446        if let Some(next_task) = next_task_opt {
447            if Arc::ptr_eq(current.as_ref().unwrap(), next_task) {
448                // The current task has been woken and picked as the next runnable task.
449                return ReschedAction::DoNothing;
450            }
451            return ReschedAction::SwitchTo(next_task.clone());
452        }
453
454        ReschedAction::Retry
455    });
456}
457
458/// Unblocks a target task.
459pub(crate) fn unpark_target(runnable: Arc<Task>) {
460    let preempt_cpu = SCHEDULER
461        .get()
462        .unwrap()
463        .enqueue(runnable, EnqueueFlags::Wake);
464    if let Some(preempt_cpu_id) = preempt_cpu {
465        set_need_preempt(preempt_cpu_id);
466    }
467}
468
469/// Enqueues a newly built task.
470///
471/// Note that the new task is not guaranteed to run at once.
472#[track_caller]
473pub(super) fn run_new_task(runnable: Arc<Task>) {
474    // FIXME: remove this check for `SCHEDULER`.
475    // Currently OSTD cannot know whether its user has injected a scheduler.
476    if !SCHEDULER.is_completed() {
477        fifo_scheduler::init();
478    }
479
480    let preempt_cpu = SCHEDULER
481        .get()
482        .unwrap()
483        .enqueue(runnable, EnqueueFlags::Spawn);
484    if let Some(preempt_cpu_id) = preempt_cpu {
485        set_need_preempt(preempt_cpu_id);
486    }
487
488    might_preempt();
489}
490
491fn set_need_preempt(cpu_id: CpuId) {
492    let preempt_guard = disable_preempt();
493
494    if preempt_guard.current_cpu() == cpu_id {
495        cpu_local::set_need_preempt();
496    } else {
497        crate::smp::inter_processor_call(&CpuSet::from(cpu_id), || {
498            cpu_local::set_need_preempt();
499        });
500    }
501}
502
503/// Dequeues the current task from its runqueue.
504///
505/// This should only be called if the current is to exit.
506#[track_caller]
507pub(super) fn exit_current() -> ! {
508    let mut is_first_try = true;
509
510    reschedule(|local_rq: &mut dyn LocalRunQueue| {
511        let next_task_opt = if is_first_try {
512            is_first_try = false;
513            let should_pick_next = local_rq.update_current(UpdateFlags::Exit);
514            let _current = local_rq.dequeue_current();
515            should_pick_next.then(|| local_rq.pick_next())
516        } else {
517            local_rq.try_pick_next()
518        };
519
520        if let Some(next_task) = next_task_opt {
521            ReschedAction::SwitchTo(next_task.clone())
522        } else {
523            ReschedAction::Retry
524        }
525    });
526
527    unreachable!()
528}
529
530/// Yields execution.
531#[track_caller]
532pub(super) fn yield_now() {
533    reschedule(|local_rq| {
534        let should_pick_next = local_rq.update_current(UpdateFlags::Yield);
535        let next_task_opt = should_pick_next.then(|| local_rq.pick_next());
536        if let Some(next_task) = next_task_opt {
537            ReschedAction::SwitchTo(next_task.clone())
538        } else {
539            ReschedAction::DoNothing
540        }
541    })
542}
543
544/// Do rescheduling by acting on the scheduling decision (`ReschedAction`) made by a
545/// user-given closure.
546///
547/// The closure makes the scheduling decision by taking the local runqueue has its input.
548#[track_caller]
549fn reschedule<F>(mut f: F)
550where
551    F: FnMut(&mut dyn LocalRunQueue) -> ReschedAction,
552{
553    // Even if the decision below is `DoNothing`, we should clear this flag. Meanwhile, to avoid
554    // race conditions, we should do this before making the decision.
555    cpu_local::clear_need_preempt();
556
557    let next_task = loop {
558        let mut action = ReschedAction::DoNothing;
559        SCHEDULER.get().unwrap().mut_local_rq_with(&mut |rq| {
560            action = f(rq);
561        });
562
563        match action {
564            ReschedAction::DoNothing => {
565                return;
566            }
567            ReschedAction::Retry => {
568                continue;
569            }
570            ReschedAction::SwitchTo(next_task) => {
571                break next_task;
572            }
573        };
574    };
575
576    // `switch_to_task` will spin if it finds that the next task is still running on some CPU core,
577    // which guarantees soundness regardless of the scheduler implementation.
578    //
579    // FIXME: The scheduler decision and context switching are not atomic, which can lead to some
580    // strange behavior even if the scheduler is implemented correctly. See "Problem 2" at
581    // <https://github.com/asterinas/asterinas/issues/1633> for details.
582    processor::switch_to_task(next_task);
583}
584
585/// Possible actions of a rescheduling.
586enum ReschedAction {
587    /// Keep running current task and do nothing.
588    DoNothing,
589    /// Loop until finding a task to swap out the current.
590    Retry,
591    /// Switch to target task.
592    SwitchTo(Arc<Task>),
593}