ostd/mm/io.rs

// SPDX-License-Identifier: MPL-2.0

//! Abstractions for reading and writing virtual memory (VM) objects.
//!
//! # Safety
//!
//! The core virtual memory (VM) access APIs provided by this module are [`VmReader`] and
//! [`VmWriter`], which allow for writing to or reading from a region of memory _safely_.
//! `VmReader` and `VmWriter` objects can be constructed from memory regions of either typed memory
//! (e.g., `&[u8]`) or untyped memory (e.g., [`UFrame`]). Behind the scenes, `VmReader` and `VmWriter`
//! must be constructed via their [`from_user_space`] and [`from_kernel_space`] methods, whose
//! safety depends on whether the given memory regions are _valid_ or not.
//!
//! [`UFrame`]: crate::mm::UFrame
//! [`from_user_space`]: `VmReader::from_user_space`
//! [`from_kernel_space`]: `VmReader::from_kernel_space`
//!
//! Here is a list of conditions for memory regions to be considered valid:
//!
//! - The memory region as a whole must be either typed or untyped memory, not both typed and
//!   untyped.
//!
//! - If the memory region is typed, we require that:
//!   - the [validity requirements] from the official Rust documentation must be met, and
//!   - the type of the memory region (which must exist since the memory is typed) must be
//!     plain-old-data, so that the writer can fill it with arbitrary data safely.
//!
//! [validity requirements]: core::ptr#safety
//!
//! - If the memory region is untyped, we require that:
//!   - the underlying pages must remain alive while the validity requirements are in effect, and
//!   - the kernel must access the memory region using only the APIs provided in this module, but
//!     external accesses from hardware devices or user programs do not count.
//!
//! We have the last requirement for untyped memory to be valid because the safety interaction with
//! other ways to access the memory region (e.g., atomic/volatile memory loads/stores) is not
//! currently specified. This may be relaxed in the future, if appropriate and necessary.
//!
//! Note that data races on untyped memory are explicitly allowed (since pages can be mapped to
//! user space, making it impossible to avoid data races). However, they may produce erroneous
//! results, such as unexpected bytes being copied, but do not cause soundness problems.
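//!
//! # Examples
//!
//! The following is a minimal, illustrative sketch of the infallible API (not compiled as a
//! doctest, and assuming `VmReader`/`VmWriter` are re-exported from `ostd::mm`): a reader and a
//! writer are constructed from plain byte slices, which count as typed memory, and data is
//! copied from one to the other.
//!
//! ```ignore
//! use ostd::mm::{VmReader, VmWriter};
//!
//! let src: [u8; 4] = [1, 2, 3, 4];
//! let mut dst = [0u8; 4];
//!
//! let mut reader = VmReader::from(&src[..]);
//! let mut writer = VmWriter::from(&mut dst[..]);
//!
//! // `read` copies as many bytes as both sides can handle and returns that count.
//! let copied = reader.read(&mut writer);
//! assert_eq!(copied, 4);
//! assert_eq!(dst, src);
//! ```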

use core::{marker::PhantomData, mem::MaybeUninit};

use ostd_pod::Pod;

use crate::{
    Error,
    arch::mm::{
        __atomic_cmpxchg_fallible, __atomic_load_fallible, __memcpy_fallible, __memset_fallible,
    },
    mm::{
        MAX_USERSPACE_VADDR,
        kspace::{KERNEL_BASE_VADDR, KERNEL_END_VADDR},
    },
    prelude::*,
};

/// A trait that enables reading/writing data from/to a VM object,
/// e.g., [`USegment`], [`Vec<UFrame>`] and [`UFrame`].
///
/// # Concurrency
///
/// The methods may be executed by multiple concurrent reader and writer
/// threads. If the results of concurrent reads or writes need to be
/// predictable or atomic, the users should add extra mechanisms to
/// ensure such properties.
///
/// [`USegment`]: crate::mm::USegment
/// [`UFrame`]: crate::mm::UFrame
pub trait VmIo {
    /// Reads requested data at a specified offset into a given `VmWriter`.
    ///
    /// # No short reads
    ///
    /// On success, the `writer` must be written with the requested data
    /// completely. If, for any reason, the requested data is only partially
    /// available, then the method shall return an error.
    fn read(&self, offset: usize, writer: &mut VmWriter) -> Result<()>;

    /// Reads a specified number of bytes at a specified offset into a given buffer.
    ///
    /// # No short reads
    ///
    /// Similar to [`read`].
    ///
    /// [`read`]: VmIo::read
    fn read_bytes(&self, offset: usize, buf: &mut [u8]) -> Result<()> {
        let mut writer = VmWriter::from(buf).to_fallible();
        self.read(offset, &mut writer)
    }

    /// Reads a value of a specified type at a specified offset.
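    ///
    /// # Examples
    ///
    /// A hypothetical sketch (not compiled as a doctest), assuming `obj` is some value whose
    /// type implements `VmIo`:
    ///
    /// ```ignore
    /// // Read a `u64` stored at byte offset 8 of the VM object.
    /// let header_field: u64 = obj.read_val(8)?;
    /// ```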
    fn read_val<T: Pod>(&self, offset: usize) -> Result<T> {
        // Why not use `MaybeUninit` for a faster implementation?
        //
        // ```rust
        // let mut val: MaybeUninit<T> = MaybeUninit::uninit();
        // let writer = unsafe {
        //     VmWriter::from_kernel_space(val.as_mut_ptr().cast(), size_of::<T>())
        // };
        // self.read(offset, &mut writer.to_fallible())?;
        // Ok(unsafe { val.assume_init() })
        // ```
        //
        // The above implementation avoids initializing `val` upfront,
        // so it is more efficient than our actual implementation.
        // Unfortunately, it is unsound.
        // This is because the `read` method,
        // which could be implemented outside OSTD and thus is untrusted,
        // may not really initialize the bits of `val` at all!

        let mut val = T::new_zeroed();
        self.read_bytes(offset, val.as_mut_bytes())?;
        Ok(val)
    }

    /// Reads a slice of a specified type at a specified offset.
    ///
    /// # No short reads
    ///
    /// Similar to [`read`].
    ///
    /// [`read`]: VmIo::read
    fn read_slice<T: Pod>(&self, offset: usize, slice: &mut [T]) -> Result<()> {
        let len_in_bytes = size_of_val(slice);
        let ptr = slice as *mut [T] as *mut u8;
        // SAFETY: the slice can be transmuted to a writable byte slice since the elements
        // are all Plain-Old-Data (Pod) types.
        let buf = unsafe { core::slice::from_raw_parts_mut(ptr, len_in_bytes) };
        self.read_bytes(offset, buf)
    }

    /// Writes all data from a given `VmReader` at a specified offset.
    ///
    /// # No short writes
    ///
    /// On success, the data from the `reader` must be written to the VM object in its entirety.
    /// If, for any reason, the input data can only be written partially,
    /// then the method shall return an error.
    fn write(&self, offset: usize, reader: &mut VmReader) -> Result<()>;

    /// Writes a specified number of bytes from a given buffer at a specified offset.
    ///
    /// # No short writes
    ///
    /// Similar to [`write`].
    ///
    /// [`write`]: VmIo::write
    fn write_bytes(&self, offset: usize, buf: &[u8]) -> Result<()> {
        let mut reader = VmReader::from(buf).to_fallible();
        self.write(offset, &mut reader)
    }

    /// Writes a value of a specified type at a specified offset.
    fn write_val<T: Pod>(&self, offset: usize, new_val: &T) -> Result<()> {
        self.write_bytes(offset, new_val.as_bytes())?;
        Ok(())
    }

    /// Writes a slice of a specified type at a specified offset.
    ///
    /// # No short writes
    ///
    /// Similar to [`write`].
    ///
    /// [`write`]: VmIo::write
    fn write_slice<T: Pod>(&self, offset: usize, slice: &[T]) -> Result<()> {
        let len_in_bytes = size_of_val(slice);
        let ptr = slice as *const [T] as *const u8;
        // SAFETY: the slice can be transmuted to a readable byte slice since the elements
        // are all Plain-Old-Data (Pod) types.
        let buf = unsafe { core::slice::from_raw_parts(ptr, len_in_bytes) };
        self.write_bytes(offset, buf)
    }
}

/// A trait that enables filling bytes (e.g., filling zeros) to a VM object.
pub trait VmIoFill {
    /// Writes `len` zeros at a specified offset.
    ///
    /// Unlike the methods in [`VmIo`], this method allows for short writes because `len` can be
    /// effectively unbounded. However, if not all bytes can be written successfully, an `Err(_)`
    /// will be returned with the error and the number of zeros that have been written thus far.
    ///
    /// # A slow, general implementation
    ///
    /// Assuming that [`VmIo`] has already been implemented for the type,
    /// this method can be implemented in the following general way.
    ///
    /// ```rust
    /// fn fill_zeros(&self, offset: usize, len: usize) -> core::result::Result<(), (Error, usize)> {
    ///     for i in 0..len {
    ///         match self.write_slice(offset + i, &[0u8]) {
    ///             Ok(()) => continue,
    ///             Err(err) => return Err((err, i)),
    ///         }
    ///     }
    ///     Ok(())
    /// }
    /// ```
    ///
    /// But we choose not to provide a general, default implementation
    /// because doing so would make it too easy for a concrete type of `VmIoFill`
    /// to settle with a slower implementation for such a performance-sensitive operation.
    fn fill_zeros(&self, offset: usize, len: usize) -> core::result::Result<(), (Error, usize)>;
}

/// A trait that enables reading/writing data from/to a VM object using one non-tearing memory
/// load/store.
///
/// See also [`VmIo`], which enables reading/writing data from/to a VM object without the guarantee
/// of using one non-tearing memory load/store.
pub trait VmIoOnce {
    /// Reads a value of the `PodOnce` type at the specified offset using one non-tearing memory
    /// load.
    ///
    /// Apart from the explicitly specified offset, the semantics of this method are the same as
    /// [`VmReader::read_once`].
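    ///
    /// # Examples
    ///
    /// A hypothetical sketch (not compiled as a doctest), assuming `regs` is a value whose type
    /// implements `VmIoOnce` and `STATUS_OFFSET` is a suitably aligned offset:
    ///
    /// ```ignore
    /// // Load a 32-bit status word with a single, non-tearing memory load.
    /// let status: u32 = regs.read_once(STATUS_OFFSET)?;
    /// ```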
    fn read_once<T: PodOnce>(&self, offset: usize) -> Result<T>;

    /// Writes a value of the `PodOnce` type at the specified offset using one non-tearing memory
    /// store.
    ///
    /// Apart from the explicitly specified offset, the semantics of this method are the same as
    /// [`VmWriter::write_once`].
    fn write_once<T: PodOnce>(&self, offset: usize, new_val: &T) -> Result<()>;
}

/// A marker type used for [`VmReader`] and [`VmWriter`],
/// representing whether reads or writes on the underlying memory region are fallible.
pub enum Fallible {}

/// A marker type used for [`VmReader`] and [`VmWriter`],
/// representing whether reads or writes on the underlying memory region are infallible.
pub enum Infallible {}

/// Copies `len` bytes from `src` to `dst`.
///
/// # Safety
///
/// - `src` must be [valid] for reads of `len` bytes.
/// - `dst` must be [valid] for writes of `len` bytes.
///
/// [valid]: crate::mm::io#safety
unsafe fn memcpy(dst: *mut u8, src: *const u8, len: usize) {
    // This method is implemented by calling `volatile_copy_memory`. Note that even with the
    // "volatile" keyword, data races are still considered undefined behavior (UB) in both the Rust
    // documentation and the C/C++ standards. In general, UB makes the behavior of the entire
    // program unpredictable, usually due to compiler optimizations that assume the absence of UB.
    // However, in this particular case, considering that the Linux kernel uses the "volatile"
    // keyword to implement `READ_ONCE` and `WRITE_ONCE`, the compiler is extremely unlikely to
    // break our code unless it also breaks the Linux kernel.
    //
    // For more details and future possibilities, see
    // <https://github.com/asterinas/asterinas/pull/1001#discussion_r1667317406>.

    // SAFETY: The safety is guaranteed by the safety preconditions and the explanation above.
    unsafe { core::intrinsics::volatile_copy_memory(dst, src, len) };
}

/// Fills `len` bytes of memory at `dst` with the specified `value`.
///
/// # Safety
///
/// - `dst` must be [valid] for writes of `len` bytes.
///
/// [valid]: crate::mm::io#safety
unsafe fn memset(dst: *mut u8, value: u8, len: usize) {
    // SAFETY: The safety is guaranteed by the safety preconditions and the explanation above.
    unsafe {
        core::intrinsics::volatile_set_memory(dst, value, len);
    }
}

/// Copies `len` bytes from `src` to `dst`.
/// This function stops copying early if it encounters an unresolvable page fault.
///
/// Returns the number of successfully copied bytes.
///
/// In the following cases, this method may cause unexpected bytes to be copied, but will not cause
/// safety problems as long as the safety requirements are met:
/// - The source and destination overlap.
/// - The current context is not associated with valid user space (e.g., in a kernel thread).
///
/// # Safety
///
/// - `src` must either be [valid] for reads of `len` bytes or be in user space for `len` bytes.
/// - `dst` must either be [valid] for writes of `len` bytes or be in user space for `len` bytes.
///
/// [valid]: crate::mm::io#safety
unsafe fn memcpy_fallible(dst: *mut u8, src: *const u8, len: usize) -> usize {
    // SAFETY: The safety is upheld by the caller.
    let failed_bytes = unsafe { __memcpy_fallible(dst, src, len) };
    len - failed_bytes
}

/// Fills `len` bytes of memory at `dst` with the specified `value`.
/// This function stops filling early if it encounters an unresolvable page fault.
///
/// Returns the number of successfully set bytes.
///
/// # Safety
///
/// - `dst` must either be [valid] for writes of `len` bytes or be in user space for `len` bytes.
///
/// [valid]: crate::mm::io#safety
unsafe fn memset_fallible(dst: *mut u8, value: u8, len: usize) -> usize {
    // SAFETY: The safety is upheld by the caller.
    let failed_bytes = unsafe { __memset_fallible(dst, value, len) };
    len - failed_bytes
}

/// Fallible memory read into a `VmWriter`.
pub trait FallibleVmRead<F> {
    /// Reads all data into the writer until one of the three conditions is met:
    /// 1. The reader has no remaining data.
    /// 2. The writer has no available space.
    /// 3. The reader/writer encounters some error.
    ///
    /// On success, the number of bytes read is returned;
    /// On error, both the error and the number of bytes read so far are returned.
    fn read_fallible(
        &mut self,
        writer: &mut VmWriter<'_, F>,
    ) -> core::result::Result<usize, (Error, usize)>;
}

/// Fallible memory write from a `VmReader`.
pub trait FallibleVmWrite<F> {
    /// Writes all data from the reader until one of the three conditions is met:
    /// 1. The reader has no remaining data.
    /// 2. The writer has no available space.
    /// 3. The reader/writer encounters some error.
    ///
    /// On success, the number of bytes written is returned;
    /// On error, both the error and the number of bytes written so far are returned.
    fn write_fallible(
        &mut self,
        reader: &mut VmReader<'_, F>,
    ) -> core::result::Result<usize, (Error, usize)>;
}

/// `VmReader` is a reader for reading data from a contiguous range of memory.
///
/// The memory range read by `VmReader` can be in either kernel space or user space.
/// When the operating range is in kernel space, the memory within that range
/// is guaranteed to be valid, and the corresponding memory reads are infallible.
/// When the operating range is in user space, it is ensured that the page table of
/// the process creating the `VmReader` is active for the duration of `'a`,
/// and the corresponding memory reads are considered fallible.
///
/// When performing a read with a `VmWriter`, if one of them represents typed memory,
/// it is ensured that the reading range of this reader and the writing range of the
/// writer do not overlap.
///
/// NOTE: The overlap mentioned above is at both the virtual address level
/// and the physical address level. There is no guarantee about the operation results
/// of `VmReader` and `VmWriter` on overlapping untyped addresses, and it is
/// the user's responsibility to handle this situation.
pub struct VmReader<'a, Fallibility = Fallible> {
    cursor: *const u8,
    end: *const u8,
    phantom: PhantomData<(&'a [u8], Fallibility)>,
}

// `Clone` can be implemented for `VmReader`
// because it either points to untyped memory or represents immutable references.
// Note that we cannot implement `Clone` for `VmWriter`
// because it can represent mutable references, which must remain exclusive.
impl<Fallibility> Clone for VmReader<'_, Fallibility> {
    fn clone(&self) -> Self {
        Self {
            cursor: self.cursor,
            end: self.end,
            phantom: PhantomData,
        }
    }
}

macro_rules! impl_read_fallible {
    ($reader_fallibility:ty, $writer_fallibility:ty) => {
        impl<'a> FallibleVmRead<$writer_fallibility> for VmReader<'a, $reader_fallibility> {
            fn read_fallible(
                &mut self,
                writer: &mut VmWriter<'_, $writer_fallibility>,
            ) -> core::result::Result<usize, (Error, usize)> {
                let copy_len = self.remain().min(writer.avail());
                if copy_len == 0 {
                    return Ok(0);
                }

                // SAFETY: The source and destination are subsets of memory ranges specified by
                // the reader and writer, so they are either valid for reading and writing or in
                // user space.
                let copied_len = unsafe { memcpy_fallible(writer.cursor, self.cursor, copy_len) };
                self.cursor = self.cursor.wrapping_add(copied_len);
                writer.cursor = writer.cursor.wrapping_add(copied_len);

                if copied_len < copy_len {
                    Err((Error::PageFault, copied_len))
                } else {
                    Ok(copied_len)
                }
            }
        }
    };
}

macro_rules! impl_write_fallible {
    ($writer_fallibility:ty, $reader_fallibility:ty) => {
        impl<'a> FallibleVmWrite<$reader_fallibility> for VmWriter<'a, $writer_fallibility> {
            fn write_fallible(
                &mut self,
                reader: &mut VmReader<'_, $reader_fallibility>,
            ) -> core::result::Result<usize, (Error, usize)> {
                reader.read_fallible(self)
            }
        }
    };
}

impl_read_fallible!(Fallible, Infallible);
impl_read_fallible!(Fallible, Fallible);
impl_read_fallible!(Infallible, Fallible);
impl_write_fallible!(Fallible, Infallible);
impl_write_fallible!(Fallible, Fallible);
impl_write_fallible!(Infallible, Fallible);

impl<'a> VmReader<'a, Infallible> {
    /// Constructs a `VmReader` from a pointer and a length, which represents
    /// a memory range in kernel space.
    ///
    /// # Safety
    ///
    /// `ptr` must be [valid] for reads of `len` bytes during the entire lifetime `'a`.
    ///
    /// [valid]: crate::mm::io#safety
    pub unsafe fn from_kernel_space(ptr: *const u8, len: usize) -> Self {
        // Rust is allowed to give the reference to a zero-sized object a very small address,
        // falling out of the kernel virtual address space range.
        // So when `len` is zero, we should not and need not check `ptr`.
        debug_assert!(len == 0 || KERNEL_BASE_VADDR <= ptr.addr());
        debug_assert!(len == 0 || ptr.addr().checked_add(len).unwrap() <= KERNEL_END_VADDR);

        Self {
            cursor: ptr,
            end: ptr.wrapping_add(len),
            phantom: PhantomData,
        }
    }

    /// Reads all data into the writer until one of the two conditions is met:
    /// 1. The reader has no remaining data.
    /// 2. The writer has no available space.
    ///
    /// Returns the number of bytes read.
    pub fn read(&mut self, writer: &mut VmWriter<'_, Infallible>) -> usize {
        let copy_len = self.remain().min(writer.avail());
        if copy_len == 0 {
            return 0;
        }

        // SAFETY: The source and destination are subsets of memory ranges specified by the reader
        // and writer, so they are valid for reading and writing.
        unsafe { memcpy(writer.cursor, self.cursor, copy_len) };
        self.cursor = self.cursor.wrapping_add(copy_len);
        writer.cursor = writer.cursor.wrapping_add(copy_len);

        copy_len
    }

    /// Reads a value of `Pod` type.
    ///
    /// If the length of the `Pod` type exceeds `self.remain()`,
    /// this method will return `Err`.
    pub fn read_val<T: Pod>(&mut self) -> Result<T> {
        if self.remain() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        let mut val = MaybeUninit::<T>::uninit();

        // SAFETY:
        // - The memory range points to typed memory.
        // - The validity requirements for write accesses are met because the pointer is converted
        //   from a mutable pointer where the underlying storage outlives the temporary lifetime
        //   and no other Rust references to the same storage exist during the lifetime.
        // - The type, i.e., `T`, is plain-old-data.
        let mut writer =
            unsafe { VmWriter::from_kernel_space(val.as_mut_ptr().cast(), size_of::<T>()) };
        self.read(&mut writer);
        debug_assert!(!writer.has_avail());

        // SAFETY:
        // - `self.read` has initialized all the bytes in `val`.
        // - The type is plain-old-data.
        let val_inited = unsafe { val.assume_init() };
        Ok(val_inited)
    }

    /// Reads a value of the `PodOnce` type using one non-tearing memory load.
    ///
    /// If the length of the `PodOnce` type exceeds `self.remain()`, this method will return `Err`.
    ///
    /// This method will not compile if the `PodOnce` type is too large for the current architecture
    /// and the operation would have to be torn into multiple memory loads.
    ///
    /// # Panics
    ///
    /// This method will panic if the current position of the reader does not meet the alignment
    /// requirements of type `T`.
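    ///
    /// # Examples
    ///
    /// A minimal sketch (not compiled as a doctest). The reader is backed by the bytes of a
    /// `u32`, so its cursor is suitably aligned for a `u32` load:
    ///
    /// ```ignore
    /// let val: u32 = 0xdead_beef;
    /// let mut reader = VmReader::from(val.as_bytes());
    /// assert_eq!(reader.read_once::<u32>().unwrap(), 0xdead_beef);
    /// ```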
    pub fn read_once<T: PodOnce>(&mut self) -> Result<T> {
        if self.remain() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        let cursor = self.cursor.cast::<T>();
        assert!(cursor.is_aligned());

        const { assert!(pod_once_impls::is_non_tearing::<T>()) };

        // SAFETY: We have checked that the number of bytes remaining is at least the size of `T`
        // and that the cursor is properly aligned with respect to the type `T`. All other safety
        // requirements are the same as for `Self::read`.
        let val = unsafe { cursor.read_volatile() };
        self.cursor = self.cursor.wrapping_add(size_of::<T>());

        Ok(val)
    }

    // Currently, there are no volatile atomic operations in `core::intrinsics`. Therefore, we do
    // not provide an infallible implementation of `VmReader::atomic_load`.

    /// Converts to a fallible reader.
    pub fn to_fallible(self) -> VmReader<'a, Fallible> {
        // It is safe to construct a fallible reader since an infallible reader covers the
        // capabilities of a fallible reader.
        VmReader {
            cursor: self.cursor,
            end: self.end,
            phantom: PhantomData,
        }
    }
}

impl VmReader<'_, Fallible> {
    /// Constructs a `VmReader` from a pointer and a length, which represents
    /// a memory range in user space.
    ///
    /// # Safety
    ///
    /// The virtual address range `ptr..ptr + len` must be in user space.
    pub unsafe fn from_user_space(ptr: *const u8, len: usize) -> Self {
        debug_assert!(ptr.addr().checked_add(len).unwrap() <= MAX_USERSPACE_VADDR);

        Self {
            cursor: ptr,
            end: ptr.wrapping_add(len),
            phantom: PhantomData,
        }
    }

    /// Reads a value of `Pod` type.
    ///
    /// If the length of the `Pod` type exceeds `self.remain()`,
    /// or the value cannot be read completely,
    /// this method will return `Err`.
    ///
    /// If the memory read fails, this method will return `Err`
    /// and the reader's cursor will remain at the original starting position.
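    ///
    /// # Examples
    ///
    /// A hypothetical sketch (not compiled as a doctest), assuming `user_reader` is a
    /// `VmReader<'_, Fallible>` covering a user-space buffer and `Request` is a `Pod` type:
    ///
    /// ```ignore
    /// // Copy a user-provided request structure into kernel memory.
    /// let request: Request = user_reader.read_val()?;
    /// ```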
    pub fn read_val<T: Pod>(&mut self) -> Result<T> {
        if self.remain() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        let mut val = MaybeUninit::<T>::uninit();

        // SAFETY:
        // - The memory range points to typed memory.
        // - The validity requirements for write accesses are met because the pointer is converted
        //   from a mutable pointer where the underlying storage outlives the temporary lifetime
        //   and no other Rust references to the same storage exist during the lifetime.
        // - The type, i.e., `T`, is plain-old-data.
        let mut writer =
            unsafe { VmWriter::from_kernel_space(val.as_mut_ptr().cast(), size_of::<T>()) };
        self.read_fallible(&mut writer)
            .map_err(|(err, copied_len)| {
                // The `copied_len` is the number of bytes read so far.
                // So the `cursor` can be moved back to the original position.
                self.cursor = self.cursor.wrapping_sub(copied_len);
                err
            })?;
        debug_assert!(!writer.has_avail());

        // SAFETY:
        // - `self.read_fallible` has initialized all the bytes in `val`.
        // - The type is plain-old-data.
        let val_inited = unsafe { val.assume_init() };
        Ok(val_inited)
    }

    /// Atomically loads a `PodAtomic` value.
    ///
    /// Regardless of whether it is successful, the cursor of the reader will not move.
    ///
    /// This method only guarantees the atomicity of the specific operation. There are no
    /// synchronization constraints on other memory accesses. This aligns with the [Relaxed
    /// ordering](https://en.cppreference.com/w/cpp/atomic/memory_order.html#Relaxed_ordering)
    /// specified in the C++11 memory model.
    ///
    /// This method will fail with errors if
    ///  1. the remaining space of the reader is less than `size_of::<T>()` bytes, or
    ///  2. the memory operation fails due to an unresolvable page fault.
    ///
    /// # Panics
    ///
    /// This method will panic if the memory location is not aligned on an `align_of::<T>()`-byte
    /// boundary.
    pub fn atomic_load<T: PodAtomic>(&self) -> Result<T> {
        if self.remain() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        let cursor = self.cursor.cast::<T>();
        assert!(cursor.is_aligned());

        // SAFETY:
        // 1. The cursor is either valid for reading or in user space for `size_of::<T>()` bytes.
        // 2. The cursor is aligned on an `align_of::<T>()`-byte boundary.
        unsafe { T::atomic_load_fallible(cursor) }
    }
}

impl<Fallibility> VmReader<'_, Fallibility> {
    /// Returns the number of bytes for the remaining data.
    pub fn remain(&self) -> usize {
        self.end.addr() - self.cursor.addr()
    }

    /// Returns the cursor pointer, which refers to the address of the next byte to read.
    pub fn cursor(&self) -> *const u8 {
        self.cursor
    }

    /// Returns whether it has remaining data to read.
    pub fn has_remain(&self) -> bool {
        self.remain() > 0
    }

    /// Limits the length of remaining data.
    ///
    /// This method ensures the post condition of `self.remain() <= max_remain`.
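    ///
    /// # Examples
    ///
    /// An illustrative sketch (not compiled as a doctest), assuming `reader` is any `VmReader`:
    ///
    /// ```ignore
    /// // Restrict the reader to at most 16 bytes, e.g., to cap an untrusted length.
    /// reader.limit(16);
    /// assert!(reader.remain() <= 16);
    /// ```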
    pub fn limit(&mut self, max_remain: usize) -> &mut Self {
        if max_remain < self.remain() {
            self.end = self.cursor.wrapping_add(max_remain);
        }

        self
    }

    /// Skips the first `nbytes` bytes of data.
    /// The length of remaining data is decreased accordingly.
    ///
    /// # Panics
    ///
    /// If `nbytes` is greater than `self.remain()`, then the method panics.
    pub fn skip(&mut self, nbytes: usize) -> &mut Self {
        assert!(nbytes <= self.remain());
        self.cursor = self.cursor.wrapping_add(nbytes);

        self
    }
}

impl<'a> From<&'a [u8]> for VmReader<'a, Infallible> {
    fn from(slice: &'a [u8]) -> Self {
        // SAFETY:
        // - The memory range points to typed memory.
        // - The validity requirements for read accesses are met because the pointer is converted
        //   from an immutable reference that outlives the lifetime `'a`.
        // - The type, i.e., the `u8` slice, is plain-old-data.
        unsafe { Self::from_kernel_space(slice.as_ptr(), slice.len()) }
    }
}

/// `VmWriter` is a writer for writing data to a contiguous range of memory.
///
/// The memory range written by `VmWriter` can be in either kernel space or user space.
/// When the operating range is in kernel space, the memory within that range
/// is guaranteed to be valid, and the corresponding memory writes are infallible.
/// When the operating range is in user space, it is ensured that the page table of
/// the process creating the `VmWriter` is active for the duration of `'a`,
/// and the corresponding memory writes are considered fallible.
///
/// When performing a write with a `VmReader`, if one of them represents typed memory,
/// it is ensured that the writing range of this writer and the reading range of the
/// reader do not overlap.
///
/// NOTE: The overlap mentioned above is at both the virtual address level
/// and the physical address level. There is no guarantee about the operation results
/// of `VmReader` and `VmWriter` on overlapping untyped addresses, and it is
/// the user's responsibility to handle this situation.
pub struct VmWriter<'a, Fallibility = Fallible> {
    cursor: *mut u8,
    end: *mut u8,
    phantom: PhantomData<(&'a mut [u8], Fallibility)>,
}

impl<'a> VmWriter<'a, Infallible> {
    /// Constructs a `VmWriter` from a pointer and a length, which represents
    /// a memory range in kernel space.
    ///
    /// # Safety
    ///
    /// `ptr` must be [valid] for writes of `len` bytes during the entire lifetime `'a`.
    ///
    /// [valid]: crate::mm::io#safety
    pub unsafe fn from_kernel_space(ptr: *mut u8, len: usize) -> Self {
        // When a zero-sized slice is cast to a pointer, the pointer may be null
        // and may not reside in our kernel space range.
        debug_assert!(len == 0 || KERNEL_BASE_VADDR <= ptr.addr());
        debug_assert!(len == 0 || ptr.addr().checked_add(len).unwrap() <= KERNEL_END_VADDR);

        Self {
            cursor: ptr,
            end: ptr.wrapping_add(len),
            phantom: PhantomData,
        }
    }

    /// Writes all data from the reader until one of the two conditions is met:
    /// 1. The reader has no remaining data.
    /// 2. The writer has no available space.
    ///
    /// Returns the number of bytes written.
    pub fn write(&mut self, reader: &mut VmReader<'_, Infallible>) -> usize {
        reader.read(self)
    }

    /// Writes a value of `Pod` type.
    ///
    /// If the length of the `Pod` type exceeds `self.avail()`,
    /// this method will return `Err`.
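    ///
    /// # Examples
    ///
    /// A minimal sketch (not compiled as a doctest), writing into a stack buffer:
    ///
    /// ```ignore
    /// let mut buf = [0u8; 8];
    /// let mut writer = VmWriter::from(&mut buf[..]);
    /// writer.write_val(&0x1234_5678u32).unwrap();
    /// writer.write_val(&0x9abc_def0u32).unwrap();
    /// assert!(!writer.has_avail());
    /// ```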
    pub fn write_val<T: Pod>(&mut self, new_val: &T) -> Result<()> {
        if self.avail() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        let mut reader = VmReader::from(new_val.as_bytes());
        self.write(&mut reader);
        Ok(())
    }

    /// Writes a value of the `PodOnce` type using one non-tearing memory store.
    ///
    /// If the length of the `PodOnce` type exceeds `self.avail()`, this method will return `Err`.
    ///
    /// # Panics
    ///
    /// This method will panic if the current position of the writer does not meet the alignment
    /// requirements of type `T`.
    pub fn write_once<T: PodOnce>(&mut self, new_val: &T) -> Result<()> {
        if self.avail() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        let cursor = self.cursor.cast::<T>();
        assert!(cursor.is_aligned());

        const { assert!(pod_once_impls::is_non_tearing::<T>()) };

        // SAFETY: We have checked that the number of bytes remaining is at least the size of `T`
        // and that the cursor is properly aligned with respect to the type `T`. All other safety
        // requirements are the same as for `Self::write`.
        unsafe { cursor.write_volatile(*new_val) };
        self.cursor = self.cursor.wrapping_add(size_of::<T>());

        Ok(())
    }

    // Currently, there are no volatile atomic operations in `core::intrinsics`. Therefore, we do
    // not provide an infallible implementation of `VmWriter::atomic_compare_exchange`.

    /// Writes `len` zeros to the target memory.
    ///
    /// This method attempts to fill up to `len` bytes with zeros. If the available
    /// memory from the current cursor position is less than `len`, it will only fill
    /// the available space.
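    ///
    /// # Examples
    ///
    /// A minimal sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// let mut buf = [0xffu8; 8];
    /// let mut writer = VmWriter::from(&mut buf[..]);
    /// // Only 8 bytes are available, so only 8 zeros are written.
    /// assert_eq!(writer.fill_zeros(16), 8);
    /// assert_eq!(buf, [0u8; 8]);
    /// ```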
    pub fn fill_zeros(&mut self, len: usize) -> usize {
        let len_to_set = self.avail().min(len);
        if len_to_set == 0 {
            return 0;
        }

        // SAFETY: The destination is a subset of the memory range specified by
        // the current writer, so it is valid for writing.
        unsafe { memset(self.cursor, 0u8, len_to_set) };
        self.cursor = self.cursor.wrapping_add(len_to_set);

        len_to_set
    }

    /// Converts to a fallible writer.
    pub fn to_fallible(self) -> VmWriter<'a, Fallible> {
        // It is safe to construct a fallible writer since an infallible writer covers the
        // capabilities of a fallible writer.
        VmWriter {
            cursor: self.cursor,
            end: self.end,
            phantom: PhantomData,
        }
    }
}

impl VmWriter<'_, Fallible> {
    /// Constructs a `VmWriter` from a pointer and a length, which represents
    /// a memory range in user space.
    ///
    /// The current context should be consistently associated with valid user space during the
    /// entire lifetime `'a`. This is for correct semantics and is not a safety requirement.
    ///
    /// # Safety
    ///
    /// `ptr` must be in user space for `len` bytes.
    pub unsafe fn from_user_space(ptr: *mut u8, len: usize) -> Self {
        debug_assert!(ptr.addr().checked_add(len).unwrap() <= MAX_USERSPACE_VADDR);

        Self {
            cursor: ptr,
            end: ptr.wrapping_add(len),
            phantom: PhantomData,
        }
    }

    /// Writes a value of `Pod` type.
    ///
    /// If the length of the `Pod` type exceeds `self.avail()`,
    /// or the value cannot be written completely,
    /// this method will return `Err`.
    ///
    /// If the memory write fails, this method will return `Err`
    /// and the writer's cursor will remain at the original starting position.
    pub fn write_val<T: Pod>(&mut self, new_val: &T) -> Result<()> {
        if self.avail() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        let mut reader = VmReader::from(new_val.as_bytes());
        self.write_fallible(&mut reader)
            .map_err(|(err, copied_len)| {
                // The `copied_len` is the number of bytes written so far.
                // So the `cursor` can be moved back to the original position.
                self.cursor = self.cursor.wrapping_sub(copied_len);
                err
            })?;
        Ok(())
    }

    /// Atomically compares and exchanges a `PodAtomic` value.
    ///
    /// This method compares `old_val` with the value pointed to by `self` and, if they are equal,
    /// updates it with `new_val`.
    ///
    /// The value that was previously in memory will be returned, along with a boolean denoting
    /// whether the compare-and-exchange succeeds. The caller usually wants to retry if this
    /// flag is false, passing the most recent value that was returned by this method.
    ///
    /// The caller is required to provide a reader which points to the exact same memory location
    /// to ensure that reading from the memory is allowed.
    ///
    /// Regardless of whether it is successful, the cursors of the reader and writer will not move.
    ///
    /// This method only guarantees the atomicity of the specific operation. There are no
    /// synchronization constraints on other memory accesses. This aligns with the [Relaxed
    /// ordering](https://en.cppreference.com/w/cpp/atomic/memory_order.html#Relaxed_ordering)
    /// specified in the C++11 memory model.
    ///
    /// Since the operation does not involve memory locks, it cannot prevent the [ABA
    /// problem](https://en.wikipedia.org/wiki/ABA_problem).
    ///
    /// This method will fail with errors if:
    ///  1. the remaining space of the reader or the available space of the writer is less than
    ///     `size_of::<T>()` bytes, or
    ///  2. the memory operation fails due to an unresolvable page fault.
    ///
    /// # Panics
    ///
    /// This method will panic if:
    ///  1. the reader and the writer do not point to the same memory location, or
    ///  2. the memory location is not aligned on an `align_of::<T>()`-byte boundary.
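    ///
    /// # Examples
    ///
    /// A hypothetical sketch (not compiled as a doctest), assuming `reader` and `writer` are a
    /// `VmReader` and a `VmWriter` whose cursors both start at the same user-space `u32`:
    ///
    /// ```ignore
    /// // A relaxed fetch-add built from a compare-and-exchange retry loop.
    /// let mut cur: u32 = reader.atomic_load()?;
    /// loop {
    ///     let (prev, ok) = writer.atomic_compare_exchange(&reader, cur, cur.wrapping_add(1))?;
    ///     if ok {
    ///         break;
    ///     }
    ///     cur = prev;
    /// }
    /// ```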
    pub fn atomic_compare_exchange<T>(
        &self,
        reader: &VmReader,
        old_val: T,
        new_val: T,
    ) -> Result<(T, bool)>
    where
        T: PodAtomic + Eq,
    {
        if self.avail() < size_of::<T>() || reader.remain() < size_of::<T>() {
            return Err(Error::InvalidArgs);
        }

        assert_eq!(self.cursor.cast_const(), reader.cursor);

        let cursor = self.cursor.cast::<T>();
        assert!(cursor.is_aligned());

        // SAFETY:
        // 1. The cursor is either valid for reading and writing or in user space for
        //    `size_of::<T>()` bytes.
        // 2. The cursor is aligned on an `align_of::<T>()`-byte boundary.
        let cur_val = unsafe { T::atomic_cmpxchg_fallible(cursor, old_val, new_val)? };

        Ok((cur_val, old_val == cur_val))
    }

    /// Writes `len` zeros to the target memory.
    ///
    /// This method attempts to fill up to `len` bytes with zeros. If the available
    /// memory from the current cursor position is less than `len`, it will only fill
    /// the available space.
    ///
    /// If the memory write fails due to an unresolvable page fault, this method
    /// will return `Err` with the number of bytes set so far.
    pub fn fill_zeros(&mut self, len: usize) -> core::result::Result<usize, (Error, usize)> {
        let len_to_set = self.avail().min(len);
        if len_to_set == 0 {
            return Ok(0);
        }

        // SAFETY: The destination is a subset of the memory range specified by
        // the current writer, so it is either valid for writing or in user space.
        let set_len = unsafe { memset_fallible(self.cursor, 0u8, len_to_set) };
        self.cursor = self.cursor.wrapping_add(set_len);

        if set_len < len_to_set {
            Err((Error::PageFault, set_len))
        } else {
            Ok(len_to_set)
        }
    }
}

impl<Fallibility> VmWriter<'_, Fallibility> {
    /// Returns the number of bytes for the available space.
    pub fn avail(&self) -> usize {
        self.end.addr() - self.cursor.addr()
    }

    /// Returns the cursor pointer, which refers to the address of the next byte to write.
    pub fn cursor(&self) -> *mut u8 {
        self.cursor
    }

    /// Returns whether it has available space to write.
    pub fn has_avail(&self) -> bool {
        self.avail() > 0
    }

    /// Limits the length of available space.
    ///
    /// This method ensures the post condition of `self.avail() <= max_avail`.
    pub fn limit(&mut self, max_avail: usize) -> &mut Self {
        if max_avail < self.avail() {
            self.end = self.cursor.wrapping_add(max_avail);
        }

        self
    }

    /// Skips the first `nbytes` bytes of data.
    /// The length of available space is decreased accordingly.
    ///
    /// # Panics
    ///
    /// If `nbytes` is greater than `self.avail()`, then the method panics.
    pub fn skip(&mut self, nbytes: usize) -> &mut Self {
        assert!(nbytes <= self.avail());
        self.cursor = self.cursor.wrapping_add(nbytes);

        self
    }
}

impl<'a> From<&'a mut [u8]> for VmWriter<'a, Infallible> {
    fn from(slice: &'a mut [u8]) -> Self {
        // SAFETY:
        // - The memory range points to typed memory.
        // - The validity requirements for write accesses are met because the pointer is converted
        //   from a mutable reference that outlives the lifetime `'a`.
        // - The type, i.e., the `u8` slice, is plain-old-data.
        unsafe { Self::from_kernel_space(slice.as_mut_ptr(), slice.len()) }
    }
}

/// A marker trait for POD types that can be read or written with one instruction.
///
/// This trait is mostly a hint, since it's safe and can be implemented for _any_ POD type. If it
/// is implemented for a type that cannot be read or written with a single instruction, calling
/// `read_once`/`write_once` will lead to a failed compile-time assertion.
pub trait PodOnce: Pod {}

#[cfg(any(
    target_arch = "x86_64",
    target_arch = "riscv64",
    target_arch = "loongarch64"
))]
mod pod_once_impls {
    use super::PodOnce;

    impl PodOnce for u8 {}
    impl PodOnce for u16 {}
    impl PodOnce for u32 {}
    impl PodOnce for u64 {}
    impl PodOnce for usize {}
    impl PodOnce for i8 {}
    impl PodOnce for i16 {}
    impl PodOnce for i32 {}
    impl PodOnce for i64 {}
    impl PodOnce for isize {}

    /// Checks whether the memory operations performed by `ptr::read_volatile` and
    /// `ptr::write_volatile` are non-tearing.
    ///
    /// Note that the Rust documentation makes no such guarantee, and even the wording in the LLVM
    /// LangRef is ambiguous. But this is unlikely to break in practice because the Linux kernel
    /// also uses "volatile" semantics to implement `READ_ONCE`/`WRITE_ONCE`.
    pub(super) const fn is_non_tearing<T>() -> bool {
        let size = size_of::<T>();

        size == 1 || size == 2 || size == 4 || size == 8
    }
}

/// A marker trait for POD types that can be read or written atomically.
pub trait PodAtomic: Pod {
    /// Atomically loads a value.
    /// This function will return errors if it encounters an unresolvable page fault.
    ///
    /// Returns the loaded value.
    ///
    /// # Safety
    ///
    /// - `ptr` must either be [valid] for reads of `size_of::<T>()` bytes or be in user
    ///   space for `size_of::<T>()` bytes.
    /// - `ptr` must be aligned on an `align_of::<T>()`-byte boundary.
    ///
    /// [valid]: crate::mm::io#safety
    #[doc(hidden)]
    unsafe fn atomic_load_fallible(ptr: *const Self) -> Result<Self>;

    /// Atomically compares and exchanges a value.
    /// This function will return errors if it encounters an unresolvable page fault.
    ///
    /// Returns the previous value.
    /// `new_val` will be written if and only if the previous value is equal to `old_val`.
    ///
    /// # Safety
    ///
    /// - `ptr` must either be [valid] for writes of `size_of::<T>()` bytes or be in user
    ///   space for `size_of::<T>()` bytes.
    /// - `ptr` must be aligned on an `align_of::<T>()`-byte boundary.
    ///
    /// [valid]: crate::mm::io#safety
    #[doc(hidden)]
    unsafe fn atomic_cmpxchg_fallible(ptr: *mut Self, old_val: Self, new_val: Self)
    -> Result<Self>;
}

impl PodAtomic for u32 {
    unsafe fn atomic_load_fallible(ptr: *const Self) -> Result<Self> {
        // SAFETY: The safety is upheld by the caller.
        let result = unsafe { __atomic_load_fallible(ptr) };
        if result == !0 {
            Err(Error::PageFault)
        } else {
            Ok(result as Self)
        }
    }

    unsafe fn atomic_cmpxchg_fallible(ptr: *mut Self, old_val: Self, new_val: Self) -> Result<u32> {
        // SAFETY: The safety is upheld by the caller.
        let result = unsafe { __atomic_cmpxchg_fallible(ptr, old_val, new_val) };
        if result == !0 {
            Err(Error::PageFault)
        } else {
            Ok(result as Self)
        }
    }
}