Skip to main content

ostd/mm/io/
mod.rs

1// SPDX-License-Identifier: MPL-2.0
2
3//! Abstractions for reading and writing virtual memory (VM) objects.
4//!
5//! # Safety
6//!
7//! The core virtual memory (VM) access APIs provided by this module are [`VmReader`] and
8//! [`VmWriter`], which allow for writing to or reading from a region of memory _safely_.
9//! `VmReader` and `VmWriter` objects can be constructed from memory regions of either typed memory
10//! (e.g., `&[u8]`) or untyped memory (e.g, [`UFrame`]). Behind the scene, `VmReader` and `VmWriter`
11//! must be constructed via their [`from_user_space`] and [`from_kernel_space`] methods, whose
12//! safety depends on whether the given memory regions are _valid_ or not.
13//!
14//! [`UFrame`]: crate::mm::UFrame
15//! [`from_user_space`]: `VmReader::from_user_space`
16//! [`from_kernel_space`]: `VmReader::from_kernel_space`
17//!
18//! Here is a list of conditions for memory regions to be considered valid:
19//!
20//! - The memory region as a whole must be either typed or untyped memory, not both typed and
21//!   untyped.
22//!
23//! - If the memory region is typed, we require that:
24//!   - the [validity requirements] from the official Rust documentation must be met, and
25//!   - the type of the memory region (which must exist since the memory is typed) must be
26//!     plain-old-data, so that the writer can fill it with arbitrary data safely.
27//!
28//! [validity requirements]: core::ptr#safety
29//!
30//! - If the memory region is untyped, we require that:
31//!   - the underlying pages must remain alive while the validity requirements are in effect, and
32//!   - the kernel must access the memory region using only the APIs provided in this module, but
33//!     external accesses from hardware devices or user programs do not count.
34//!
35//! We have the last requirement for untyped memory to be valid because the safety interaction with
36//! other ways to access the memory region (e.g., atomic/volatile memory loads/stores) is not
37//! currently specified. This may be relaxed in the future, if appropriate and necessary.
38//!
39//! Note that data races on untyped memory are explicitly allowed (since pages can be mapped to
40//! user space, making it impossible to avoid data races). However, they may produce erroneous
41//! results, such as unexpected bytes being copied, but do not cause soundness problems.
42
43pub(crate) mod copy;
44pub mod util;
45
46use core::{marker::PhantomData, mem::MaybeUninit};
47
48use ostd_pod::Pod;
49
50use self::copy::{memcpy, memset};
51use crate::{
52    Error,
53    arch::mm::{__atomic_cmpxchg_fallible, __atomic_load_fallible},
54    mm::{
55        MAX_USERSPACE_VADDR,
56        kspace::{KERNEL_BASE_VADDR, KERNEL_END_VADDR},
57    },
58    prelude::*,
59};
60
61/// A trait that enables reading/writing data from/to a VM object,
62/// e.g., [`USegment`], [`Vec<UFrame>`] and [`UFrame`].
63///
64/// # Concurrency
65///
66/// The methods may be executed by multiple concurrent reader and writer
67/// threads. In this case, if the results of concurrent reads or writes
68/// desire predictability or atomicity, the users should add extra mechanism
69/// for such properties.
70///
71/// [`USegment`]: crate::mm::USegment
72/// [`UFrame`]: crate::mm::UFrame
73pub trait VmIo {
74    /// Reads requested data at a specified offset into a given `VmWriter`.
75    ///
76    /// # No short reads
77    ///
78    /// On success, the `writer` must be written with the requested data
79    /// completely. If, for any reason, the requested data is only partially
80    /// available, then the method shall return an error.
81    fn read(&self, offset: usize, writer: &mut VmWriter) -> Result<()>;
82
83    /// Reads a specified number of bytes at a specified offset into a given buffer.
84    ///
85    /// # No short reads
86    ///
87    /// Similar to [`read`].
88    ///
89    /// [`read`]: VmIo::read
90    fn read_bytes(&self, offset: usize, buf: &mut [u8]) -> Result<()> {
91        let mut writer = VmWriter::from(buf).to_fallible();
92        self.read(offset, &mut writer)
93    }
94
95    /// Reads a value of a specified type at a specified offset.
96    fn read_val<T: Pod>(&self, offset: usize) -> Result<T> {
97        // Why not use `MaybeUninit` for a faster implementation?
98        //
99        // ```rust
100        // let mut val: MaybeUninit<T> = MaybeUninit::uninit();
101        // let writer = unsafe {
102        //     VmWriter::from_kernel_space(val.as_mut_ptr().cast(), size_of::<T>())
103        // };
104        // self.read(offset, &mut writer.to_fallible())?;
105        // Ok(unsafe { val.assume_init() })
106        // ```
107        //
108        // The above implementation avoids initializing `val` upfront,
109        // so it is more efficient than our actual implementation.
110        // Unfortunately, it is unsound.
111        // This is because the `read` method,
112        // which could be implemented outside OSTD and thus is untrusted,
113        // may not really initialize the bits of `val` at all!
114
115        let mut val = T::new_zeroed();
116        self.read_bytes(offset, val.as_mut_bytes())?;
117        Ok(val)
118    }
119
120    /// Reads a slice of a specified type at a specified offset.
121    ///
122    /// # No short reads
123    ///
124    /// Similar to [`read`].
125    ///
126    /// [`read`]: VmIo::read
127    fn read_slice<T: Pod>(&self, offset: usize, slice: &mut [T]) -> Result<()> {
128        let len_in_bytes = size_of_val(slice);
129        let ptr = slice as *mut [T] as *mut u8;
130        // SAFETY: the slice can be transmuted to a writable byte slice since the elements
131        // are all Plain-Old-Data (Pod) types.
132        let buf = unsafe { core::slice::from_raw_parts_mut(ptr, len_in_bytes) };
133        self.read_bytes(offset, buf)
134    }
135
136    /// Writes all data from a given `VmReader` at a specified offset.
137    ///
138    /// # No short writes
139    ///
140    /// On success, the data from the `reader` must be read to the VM object entirely.
141    /// If, for any reason, the input data can only be written partially,
142    /// then the method shall return an error.
143    fn write(&self, offset: usize, reader: &mut VmReader) -> Result<()>;
144
145    /// Writes a specified number of bytes from a given buffer at a specified offset.
146    ///
147    /// # No short writes
148    ///
149    /// Similar to [`write`].
150    ///
151    /// [`write`]: VmIo::write
152    fn write_bytes(&self, offset: usize, buf: &[u8]) -> Result<()> {
153        let mut reader = VmReader::from(buf).to_fallible();
154        self.write(offset, &mut reader)
155    }
156
157    /// Writes a value of a specified type at a specified offset.
158    fn write_val<T: Pod>(&self, offset: usize, new_val: &T) -> Result<()> {
159        self.write_bytes(offset, new_val.as_bytes())?;
160        Ok(())
161    }
162
163    /// Writes a slice of a specified type at a specified offset.
164    ///
165    /// # No short write
166    ///
167    /// Similar to [`write`].
168    ///
169    /// [`write`]: VmIo::write
170    fn write_slice<T: Pod>(&self, offset: usize, slice: &[T]) -> Result<()> {
171        let len_in_bytes = size_of_val(slice);
172        let ptr = slice as *const [T] as *const u8;
173        // SAFETY: the slice can be transmuted to a readable byte slice since the elements
174        // are all Plain-Old-Data (Pod) types.
175        let buf = unsafe { core::slice::from_raw_parts(ptr, len_in_bytes) };
176        self.write_bytes(offset, buf)
177    }
178}
179
180/// A trait that enables filling a VM object with bytes (e.g., filling it with zeros).
181pub trait VmIoFill {
182    /// Writes `len` zeros at a specified offset.
183    ///
184    /// Unlike the methods in [`VmIo`], this method allows for short writes because `len` can be
185    /// effectively unbounded. However, if not all bytes can be written successfully, an `Err(_)`
186    /// is returned that carries both the error and the number of zeros written thus far.
187    ///
188    /// # A slow, general implementation
189    ///
190    /// If [`VmIo`] has already been implemented for the type,
191    /// this method can be implemented in the following general way.
192    ///
193    /// ```rust
194    /// fn fill_zeros(&self, offset: usize, len: usize) -> Result<(), (Error, usize)> {
195    ///     for i in 0..len {
196    ///         match self.write_slice(offset + i, &[0u8]) {
197    ///             Ok(()) => continue,
198    ///             Err(err) => return Err((err, i)),
199    ///         }
200    ///     }
201    ///     Ok(())
202    /// }
203    /// ```
204    ///
205    /// However, no general, default implementation is provided on purpose:
206    /// doing so would make it too easy for a concrete type of `VmIoFill`
207    /// to settle for a slower implementation of such a performance-sensitive operation.
208    fn fill_zeros(&self, offset: usize, len: usize) -> Result<(), (Error, usize)>;
209}
210
211/// A trait that enables reading/writing data from/to a VM object using one non-tearing memory
212/// load/store.
213///
214/// See also [`VmIo`], which enables reading/writing data from/to a VM object without the guarantee
215/// of using one non-tearing memory load/store.
216pub trait VmIoOnce {
217    /// Reads a value of the `PodOnce` type at the specified offset using one non-tearing memory
218    /// load.
219    ///
220    /// Apart from the offset being specified explicitly, the semantics of this method are the
221    /// same as those of [`VmReader::read_once`].
222    fn read_once<T: PodOnce>(&self, offset: usize) -> Result<T>;
223
224    /// Writes a value of the `PodOnce` type at the specified offset using one non-tearing memory
225    /// store.
226    ///
227    /// Apart from the offset being specified explicitly, the semantics of this method are the
228    /// same as those of [`VmWriter::write_once`].
229    fn write_once<T: PodOnce>(&self, offset: usize, new_val: &T) -> Result<()>;
230}
231
232/// A marker type used for _fallible_ memory,
233/// where memory access _might_ trigger page faults.
234///
235/// The most prominent example of fallible memory is user virtual memory.
236///
237/// By definition, infallible memory is a subset of fallible memory.
238/// As a consequence, any code that intends to work with fallible memory
239/// should work for both user virtual memory and kernel virtual memory.
240///
241/// [`VmReader`] and [`VmWriter`] take this variant-less marker as a type parameter
242/// to indicate the property of the underlying memory.
243pub enum Fallible {}
244
245/// A marker type used for _infallible_ memory,
246/// where memory access is valid and won't trigger page faults.
247///
248/// The most prominent example of infallible memory is kernel virtual memory
249/// (at least for the part where Rust code and data reside).
250///
251/// [`VmReader`] and [`VmWriter`] take this variant-less marker as a type parameter
252/// to indicate the property of the underlying memory.
253pub enum Infallible {}
254
255/// A marker type for I/O memory regions.
256///
257/// This marker is used by [`memcpy`] and [`memset`]
258/// to indicate that a source or destination operand
259/// resides in I/O memory (MMIO).
260///
261/// Unlike [`Fallible`] and [`Infallible`],
262/// the `Io` marker does not statically tell
263/// whether a memory access will fault,
264/// because MMIO fallibility is platform-dependent.
265/// For example, on Intel TDX
266/// every MMIO access triggers a #VE exception,
267/// whereas on a non-CVM x86 host
268/// the same access completes without faulting.
269pub(crate) enum Io {}
270
271/// Fallible memory read into a `VmWriter`.
/// `F` is the fallibility marker of the destination writer.
272pub trait FallibleVmRead<F> {
273    /// Reads all data into the writer until one of the three conditions is met:
274    /// 1. The reader has no remaining data.
275    /// 2. The writer has no available space.
276    /// 3. The reader/writer encounters some error.
277    ///
278    /// On success, the number of bytes read is returned;
279    /// on error, both the error and the number of bytes read so far are returned.
280    fn read_fallible(&mut self, writer: &mut VmWriter<'_, F>) -> Result<usize, (Error, usize)>;
281}
282
283/// Fallible memory write from a `VmReader`.
/// `F` is the fallibility marker of the source reader.
284pub trait FallibleVmWrite<F> {
285    /// Writes all data from the reader until one of the three conditions is met:
286    /// 1. The reader has no remaining data.
287    /// 2. The writer has no available space.
288    /// 3. The reader/writer encounters some error.
289    ///
290    /// On success, the number of bytes written is returned;
291    /// on error, both the error and the number of bytes written so far are returned.
292    fn write_fallible(&mut self, reader: &mut VmReader<'_, F>) -> Result<usize, (Error, usize)>;
293}
294
295/// `VmReader` is a reader for reading data from a contiguous range of memory.
296///
297/// The memory range read by `VmReader` can be in either kernel space or user space.
298/// When the operating range is in kernel space, the memory within that range
299/// is guaranteed to be valid, and the corresponding memory reads are infallible.
300/// When the operating range is in user space, it is ensured that the page table of
301/// the process creating the `VmReader` is active for the duration of `'a`,
302/// and the corresponding memory reads are considered fallible.
303///
304/// When reading into a `VmWriter`, if one of the two represents typed memory,
305/// it is ensured that the reading range of this reader and the writing range of the
306/// writer do not overlap.
307///
308/// NOTE: The overlap mentioned above is at both the virtual address level
309/// and physical address level. There is no guarantee for the operation results
310/// of `VmReader` and `VmWriter` in overlapping untyped addresses, and it is
311/// the user's responsibility to handle this situation.
312pub struct VmReader<'a, Fallibility = Fallible> {
313    cursor: *const u8, // Address of the next byte to read.
314    end: *const u8, // Address one past the last byte of the range.
315    phantom: PhantomData<(&'a [u8], Fallibility)>,
316}
317
318// `Clone` can be implemented for `VmReader`
319// because it either points to untyped memory or represents immutable references.
320// Note that we cannot implement `Clone` for `VmWriter`
321// because it can represent mutable references, which must remain exclusive.
322impl<Fallibility> Clone for VmReader<'_, Fallibility> {
323    fn clone(&self) -> Self {
324        Self {
325            cursor: self.cursor,
326            end: self.end,
327            phantom: PhantomData,
328        }
329    }
330}
331
// Generates a `FallibleVmRead` implementation for a reader with fallibility
// `$reader_fallibility` that copies into a writer with fallibility `$writer_fallibility`.
macro_rules! impl_read_fallible {
    ($reader_fallibility:ty, $writer_fallibility:ty) => {
        impl<'a> FallibleVmRead<$writer_fallibility> for VmReader<'a, $reader_fallibility> {
            fn read_fallible(
                &mut self,
                writer: &mut VmWriter<'_, $writer_fallibility>,
            ) -> Result<usize, (Error, usize)> {
                // Copy no more than what the reader still holds and the writer can still take.
                let wanted = self.remain().min(writer.avail());
                if wanted == 0 {
                    return Ok(0);
                }

                let (dst, src) = (writer.cursor, self.cursor);

                // SAFETY: The source and destination are subsets of memory ranges specified by
                // the reader and writer, so they are either valid for reading and writing or in
                // user space.
                let done = unsafe {
                    memcpy::<$writer_fallibility, $reader_fallibility>(dst, src, wanted)
                };

                // Both cursors advance by what was actually copied, even for a partial copy.
                writer.cursor = dst.wrapping_add(done);
                self.cursor = src.wrapping_add(done);

                if done >= wanted {
                    Ok(done)
                } else {
                    // A copy that stopped short is reported as a page fault.
                    Err((Error::PageFault, done))
                }
            }
        }
    };
}
366
// Generates a `FallibleVmWrite` implementation for a writer with fallibility
// `$writer_fallibility` that takes its data from a reader with fallibility
// `$reader_fallibility`. The generated method only forwards to the reader's
// `read_fallible`, so a `FallibleVmRead` implementation for the same pairing must exist.
367macro_rules! impl_write_fallible {
368    ($writer_fallibility:ty, $reader_fallibility:ty) => {
369        impl<'a> FallibleVmWrite<$reader_fallibility> for VmWriter<'a, $writer_fallibility> {
370            fn write_fallible(
371                &mut self,
372                reader: &mut VmReader<'_, $reader_fallibility>,
373            ) -> Result<usize, (Error, usize)> {
                // Writing from a reader is the same operation as that reader reading into us.
374                reader.read_fallible(self)
375            }
376        }
377    };
378}
379
// Instantiate the fallible copy for every reader/writer pairing in which at least one side is
// `Fallible`. Note the argument order: `impl_read_fallible!` takes (reader, writer), whereas
// `impl_write_fallible!` takes (writer, reader).
380impl_read_fallible!(Fallible, Infallible);
381impl_read_fallible!(Fallible, Fallible);
382impl_read_fallible!(Infallible, Fallible);
383impl_write_fallible!(Fallible, Infallible);
384impl_write_fallible!(Fallible, Fallible);
385impl_write_fallible!(Infallible, Fallible);
386
387impl<'a> VmReader<'a, Infallible> {
388    /// Constructs a `VmReader` from a pointer and a length, which represents
389    /// a memory range in kernel space.
390    ///
391    /// # Safety
392    ///
393    /// `ptr` must be [valid] for reads of `len` bytes during the entire lifetime `a`.
394    ///
395    /// [valid]: crate::mm::io#safety
396    pub unsafe fn from_kernel_space(ptr: *const u8, len: usize) -> Self {
397        // Rust is allowed to give the reference to a zero-sized object a very small address,
398        // falling out of the kernel virtual address space range.
399        // So when `len` is zero, we should not and need not to check `ptr`.
400        debug_assert!(len == 0 || KERNEL_BASE_VADDR <= ptr.addr());
401        debug_assert!(len == 0 || ptr.addr().checked_add(len).unwrap() <= KERNEL_END_VADDR);
402
403        Self {
404            cursor: ptr,
405            end: ptr.wrapping_add(len),
406            phantom: PhantomData,
407        }
408    }
409
410    /// Reads all data into the writer until one of the two conditions is met:
411    /// 1. The reader has no remaining data.
412    /// 2. The writer has no available space.
413    ///
414    /// Returns the number of bytes read.
415    pub fn read(&mut self, writer: &mut VmWriter<'_, Infallible>) -> usize {
416        let copy_len = self.remain().min(writer.avail());
417        if copy_len == 0 {
418            return 0;
419        }
420
421        // SAFETY: The source and destination are subsets of memory ranges specified by the reader
422        // and writer, so they are valid for reading and writing.
423        unsafe { memcpy::<Infallible, Infallible>(writer.cursor, self.cursor, copy_len) };
424        self.cursor = self.cursor.wrapping_add(copy_len);
425        writer.cursor = writer.cursor.wrapping_add(copy_len);
426
427        copy_len
428    }
429
430    /// Reads a value of `Pod` type.
431    ///
432    /// If the length of the `Pod` type exceeds `self.remain()`,
433    /// this method will return `Err`.
434    pub fn read_val<T: Pod>(&mut self) -> Result<T> {
435        if self.remain() < size_of::<T>() {
436            return Err(Error::InvalidArgs);
437        }
438
439        let cursor = self.cursor.cast::<T>();
440
441        // SAFETY: We have checked that the number of bytes remaining is at least the size of `T`.
442        // All other safety requirements are the same as for `Self::read`.
443        let val = unsafe { core::intrinsics::unaligned_volatile_load(cursor) };
444        self.cursor = self.cursor.wrapping_add(size_of::<T>());
445
446        Ok(val)
447    }
448
449    /// Reads a value of the `PodOnce` type using one non-tearing memory load.
450    ///
451    /// If the length of the `PodOnce` type exceeds `self.remain()`, this method will return `Err`.
452    ///
453    /// This method will not compile if the `Pod` type is too large for the current architecture
454    /// and the operation must be tear into multiple memory loads.
455    ///
456    /// # Panics
457    ///
458    /// This method will panic if the current position of the reader does not meet the alignment
459    /// requirements of type `T`.
460    pub fn read_once<T: PodOnce>(&mut self) -> Result<T> {
461        if self.remain() < size_of::<T>() {
462            return Err(Error::InvalidArgs);
463        }
464
465        let cursor = self.cursor.cast::<T>();
466        assert!(cursor.is_aligned());
467
468        const { assert!(pod_once_impls::is_non_tearing::<T>()) };
469
470        // SAFETY: We have checked that the number of bytes remaining is at least the size of `T`
471        // and that the cursor is properly aligned with respect to the type `T`. All other safety
472        // requirements are the same as for `Self::read`.
473        let val = unsafe { cursor.read_volatile() };
474        self.cursor = self.cursor.wrapping_add(size_of::<T>());
475
476        Ok(val)
477    }
478
479    // Currently, there are no volatile atomic operations in `core::intrinsics`. Therefore, we do
480    // not provide an infallible implementation of `VmReader::atomic_load`.
481
482    /// Converts to a fallible reader.
483    pub fn to_fallible(self) -> VmReader<'a, Fallible> {
484        // It is safe to construct a fallible reader since an infallible reader covers the
485        // capabilities of a fallible reader.
486        VmReader {
487            cursor: self.cursor,
488            end: self.end,
489            phantom: PhantomData,
490        }
491    }
492}
493
494impl VmReader<'_, Fallible> {
495    /// Constructs a `VmReader` from a pointer and a length, which represents
496    /// a memory range in user space.
497    ///
498    /// # Safety
499    ///
500    /// The virtual address range `ptr..ptr + len` must be in user space.
501    pub unsafe fn from_user_space(ptr: *const u8, len: usize) -> Self {
502        debug_assert!(ptr.addr().checked_add(len).unwrap() <= MAX_USERSPACE_VADDR);
503
504        Self {
505            cursor: ptr,
506            end: ptr.wrapping_add(len),
507            phantom: PhantomData,
508        }
509    }
510
511    /// Reads a value of `Pod` type.
512    ///
513    /// If the length of the `Pod` type exceeds `self.remain()`,
514    /// or the value can not be read completely,
515    /// this method will return `Err`.
516    ///
517    /// If the memory read failed, this method will return `Err`
518    /// and the current reader's cursor remains pointing to
519    /// the original starting position.
520    pub fn read_val<T: Pod>(&mut self) -> Result<T> {
521        if self.remain() < size_of::<T>() {
522            return Err(Error::InvalidArgs);
523        }
524
525        let mut val = MaybeUninit::<T>::uninit();
526
527        // SAFETY:
528        // - The memory range points to typed memory.
529        // - The validity requirements for write accesses are met because the pointer is converted
530        //   from a mutable pointer where the underlying storage outlives the temporary lifetime
531        //   and no other Rust references to the same storage exist during the lifetime.
532        // - The type, i.e., `T`, is plain-old-data.
533        let mut writer =
534            unsafe { VmWriter::from_kernel_space(val.as_mut_ptr().cast(), size_of::<T>()) };
535        self.read_fallible(&mut writer)
536            .map_err(|(err, copied_len)| {
537                // The `copied_len` is the number of bytes read so far.
538                // So the `cursor` can be moved back to the original position.
539                self.cursor = self.cursor.wrapping_sub(copied_len);
540                err
541            })?;
542        debug_assert!(!writer.has_avail());
543
544        // SAFETY:
545        // - `self.read_fallible` has initialized all the bytes in `val`.
546        // - The type is plain-old-data.
547        let val_inited = unsafe { val.assume_init() };
548        Ok(val_inited)
549    }
550
551    /// Atomically loads a `PodAtomic` value.
552    ///
553    /// Regardless of whether it is successful, the cursor of the reader will not move.
554    ///
555    /// This method only guarantees the atomicity of the specific operation. There are no
556    /// synchronization constraints on other memory accesses. This aligns with the [Relaxed
557    /// ordering](https://en.cppreference.com/w/cpp/atomic/memory_order.html#Relaxed_ordering)
558    /// specified in the C++11 memory model.
559    ///
560    /// This method will fail with errors if
561    ///  1. the remaining space of the reader is less than `size_of::<T>()` bytes, or
562    ///  2. the memory operation fails due to an unresolvable page fault.
563    ///
564    /// # Panics
565    ///
566    /// This method will panic if the memory location is not aligned on an `align_of::<T>()`-byte
567    /// boundary.
568    pub fn atomic_load<T: PodAtomic>(&self) -> Result<T> {
569        if self.remain() < size_of::<T>() {
570            return Err(Error::InvalidArgs);
571        }
572
573        let cursor = self.cursor.cast::<T>();
574        assert!(cursor.is_aligned());
575
576        // SAFETY:
577        // 1. The cursor is either valid for reading or in user space for `size_of::<T>()` bytes.
578        // 2. The cursor is aligned on an `align_of::<T>()`-byte boundary.
579        unsafe { T::atomic_load_fallible(cursor) }
580    }
581}
582
583impl<Fallibility> VmReader<'_, Fallibility> {
584    /// Returns the number of bytes for the remaining data.
585    pub fn remain(&self) -> usize {
586        self.end.addr() - self.cursor.addr()
587    }
588
589    /// Returns the cursor pointer, which refers to the address of the next byte to read.
590    pub fn cursor(&self) -> *const u8 {
591        self.cursor
592    }
593
594    /// Returns if it has remaining data to read.
595    pub fn has_remain(&self) -> bool {
596        self.remain() > 0
597    }
598
599    /// Limits the length of remaining data.
600    ///
601    /// This method ensures the post condition of `self.remain() <= max_remain`.
602    pub fn limit(&mut self, max_remain: usize) -> &mut Self {
603        if max_remain < self.remain() {
604            self.end = self.cursor.wrapping_add(max_remain);
605        }
606
607        self
608    }
609
610    /// Skips the first `nbytes` bytes of data.
611    /// The length of remaining data is decreased accordingly.
612    ///
613    /// # Panics
614    ///
615    /// If `nbytes` is greater than `self.remain()`, then the method panics.
616    pub fn skip(&mut self, nbytes: usize) -> &mut Self {
617        assert!(nbytes <= self.remain());
618        self.cursor = self.cursor.wrapping_add(nbytes);
619
620        self
621    }
622}
623
624impl<'a> From<&'a [u8]> for VmReader<'a, Infallible> {
625    fn from(slice: &'a [u8]) -> Self {
626        // SAFETY:
627        // - The memory range points to typed memory.
628        // - The validity requirements for read accesses are met because the pointer is converted
629        //   from an immutable reference that outlives the lifetime `'a`.
630        // - The type, i.e., the `u8` slice, is plain-old-data.
631        unsafe { Self::from_kernel_space(slice.as_ptr(), slice.len()) }
632    }
633}
634
635/// `VmWriter` is a writer for writing data to a contiguous range of memory.
636///
637/// The memory range written by `VmWriter` can be in either kernel space or user space.
638/// When the operating range is in kernel space, the memory within that range
639/// is guaranteed to be valid, and the corresponding memory writes are infallible.
640/// When the operating range is in user space, it is ensured that the page table of
641/// the process creating the `VmWriter` is active for the duration of `'a`,
642/// and the corresponding memory writes are considered fallible.
643///
644/// When writing from a `VmReader`, if one of the two represents typed memory,
645/// it is ensured that the writing range of this writer and the reading range of the
646/// reader do not overlap.
647///
648/// NOTE: The overlap mentioned above is at both the virtual address level
649/// and physical address level. There is no guarantee for the operation results
650/// of `VmReader` and `VmWriter` in overlapping untyped addresses, and it is
651/// the user's responsibility to handle this situation.
652pub struct VmWriter<'a, Fallibility = Fallible> {
653    cursor: *mut u8, // Address of the next byte to write.
654    end: *mut u8, // Address one past the last byte of the range.
655    phantom: PhantomData<(&'a mut [u8], Fallibility)>,
656}
657
658impl<'a> VmWriter<'a, Infallible> {
659    /// Constructs a `VmWriter` from a pointer and a length, which represents
660    /// a memory range in kernel space.
661    ///
662    /// # Safety
663    ///
664    /// `ptr` must be [valid] for writes of `len` bytes during the entire lifetime `a`.
665    ///
666    /// [valid]: crate::mm::io#safety
667    pub unsafe fn from_kernel_space(ptr: *mut u8, len: usize) -> Self {
668        // If casting a zero sized slice to a pointer, the pointer may be null
669        // and does not reside in our kernel space range.
670        debug_assert!(len == 0 || KERNEL_BASE_VADDR <= ptr.addr());
671        debug_assert!(len == 0 || ptr.addr().checked_add(len).unwrap() <= KERNEL_END_VADDR);
672
673        Self {
674            cursor: ptr,
675            end: ptr.wrapping_add(len),
676            phantom: PhantomData,
677        }
678    }
679
680    /// Writes all data from the reader until one of the two conditions is met:
681    /// 1. The reader has no remaining data.
682    /// 2. The writer has no available space.
683    ///
684    /// Returns the number of bytes written.
685    pub fn write(&mut self, reader: &mut VmReader<'_, Infallible>) -> usize {
686        reader.read(self)
687    }
688
689    /// Writes a value of `Pod` type.
690    ///
691    /// If the length of the `Pod` type exceeds `self.avail()`,
692    /// this method will return `Err`.
693    pub fn write_val<T: Pod>(&mut self, new_val: &T) -> Result<()> {
694        if self.avail() < size_of::<T>() {
695            return Err(Error::InvalidArgs);
696        }
697
698        let cursor = self.cursor.cast::<T>();
699
700        // SAFETY: We have checked that the number of bytes remaining is at least the size of `T`.
701        // All other safety requirements are the same as for `Self::write`.
702        unsafe { core::intrinsics::unaligned_volatile_store(cursor, *new_val) };
703        self.cursor = self.cursor.wrapping_add(size_of::<T>());
704
705        Ok(())
706    }
707
708    /// Writes a value of the `PodOnce` type using one non-tearing memory store.
709    ///
710    /// If the length of the `PodOnce` type exceeds `self.remain()`, this method will return `Err`.
711    ///
712    /// # Panics
713    ///
714    /// This method will panic if the current position of the writer does not meet the alignment
715    /// requirements of type `T`.
716    pub fn write_once<T: PodOnce>(&mut self, new_val: &T) -> Result<()> {
717        if self.avail() < size_of::<T>() {
718            return Err(Error::InvalidArgs);
719        }
720
721        let cursor = self.cursor.cast::<T>();
722        assert!(cursor.is_aligned());
723
724        const { assert!(pod_once_impls::is_non_tearing::<T>()) };
725
726        // SAFETY: We have checked that the number of bytes remaining is at least the size of `T`
727        // and that the cursor is properly aligned with respect to the type `T`. All other safety
728        // requirements are the same as for `Self::write`.
729        unsafe { cursor.write_volatile(*new_val) };
730        self.cursor = self.cursor.wrapping_add(size_of::<T>());
731
732        Ok(())
733    }
734
735    // Currently, there are no volatile atomic operations in `core::intrinsics`. Therefore, we do
736    // not provide an infallible implementation of `VmWriter::atomic_compare_exchange`.
737
738    /// Writes `len` zeros to the target memory.
739    ///
740    /// This method attempts to fill up to `len` bytes with zeros. If the available
741    /// memory from the current cursor position is less than `len`, it will only fill
742    /// the available space.
743    pub fn fill_zeros(&mut self, len: usize) -> usize {
744        let len_to_set = self.avail().min(len);
745        if len_to_set == 0 {
746            return 0;
747        }
748
749        // SAFETY: The destination is a subset of the memory range specified by
750        // the current writer, so it is valid for writing.
751        unsafe { memset::<Infallible>(self.cursor, 0u8, len_to_set) };
752        self.cursor = self.cursor.wrapping_add(len_to_set);
753
754        len_to_set
755    }
756
757    /// Converts to a fallible writer.
758    pub fn to_fallible(self) -> VmWriter<'a, Fallible> {
759        // It is safe to construct a fallible reader since an infallible reader covers the
760        // capabilities of a fallible reader.
761        VmWriter {
762            cursor: self.cursor,
763            end: self.end,
764            phantom: PhantomData,
765        }
766    }
767}
768
769impl VmWriter<'_, Fallible> {
770    /// Constructs a `VmWriter` from a pointer and a length, which represents
771    /// a memory range in user space.
772    ///
773    /// The current context should be consistently associated with valid user space during the
774    /// entire lifetime `'a`. This is for correct semantics and is not a safety requirement.
775    ///
776    /// # Safety
777    ///
778    /// `ptr` must be in user space for `len` bytes.
779    pub unsafe fn from_user_space(ptr: *mut u8, len: usize) -> Self {
780        debug_assert!(ptr.addr().checked_add(len).unwrap() <= MAX_USERSPACE_VADDR);
781
782        Self {
783            cursor: ptr,
784            end: ptr.wrapping_add(len),
785            phantom: PhantomData,
786        }
787    }
788
789    /// Writes a value of `Pod` type.
790    ///
791    /// If the length of the `Pod` type exceeds `self.avail()`,
792    /// or the value can not be write completely,
793    /// this method will return `Err`.
794    ///
795    /// If the memory write failed, this method will return `Err`
796    /// and the current writer's cursor remains pointing to
797    /// the original starting position.
798    pub fn write_val<T: Pod>(&mut self, new_val: &T) -> Result<()> {
799        if self.avail() < size_of::<T>() {
800            return Err(Error::InvalidArgs);
801        }
802
803        let mut reader = VmReader::from(new_val.as_bytes());
804        self.write_fallible(&mut reader)
805            .map_err(|(err, copied_len)| {
806                // The `copied_len` is the number of bytes written so far.
807                // So the `cursor` can be moved back to the original position.
808                self.cursor = self.cursor.wrapping_sub(copied_len);
809                err
810            })?;
811        Ok(())
812    }
813
814    /// Atomically compares and exchanges a `PodAtomic` value.
815    ///
816    /// This method compares `old_val` with the value pointed by `self` and, if they are equal,
817    /// updates it with `new_val`.
818    ///
819    /// The value that was previously in memory will be returned, along with a boolean denoting
820    /// whether the compare-and-exchange succeeds. The caller usually wants to retry if this
821    /// flag is false, passing the most recent value that was returned by this method.
822    ///
823    /// The caller is required to provide a reader which points to the exact same memory location
824    /// to ensure that reading from the memory is allowed.
825    ///
826    /// Regardless of whether it is successful, the cursors of the reader and writer will not move.
827    ///
828    /// This method only guarantees the atomicity of the specific operation. There are no
829    /// synchronization constraints on other memory accesses. This aligns with the [Relaxed
830    /// ordering](https://en.cppreference.com/w/cpp/atomic/memory_order.html#Relaxed_ordering)
831    /// specified in the C++11 memory model.
832    ///
833    /// Since the operation does not involve memory locks, it can't prevent the [ABA
834    /// problem](https://en.wikipedia.org/wiki/ABA_problem).
835    ///
836    /// This method will fail with errors if:
837    ///  1. the remaining space of the reader or the available space of the writer are less than
838    ///     `size_of::<T>()` bytes, or
839    ///  2. the memory operation fails due to an unresolvable page fault.
840    ///
841    /// # Panics
842    ///
843    /// This method will panic if:
844    ///  1. the reader and the writer does not point to the same memory location, or
845    ///  2. the memory location is not aligned on an `align_of::<T>()`-byte boundary.
846    pub fn atomic_compare_exchange<T>(
847        &self,
848        reader: &VmReader,
849        old_val: T,
850        new_val: T,
851    ) -> Result<(T, bool)>
852    where
853        T: PodAtomic + Eq,
854    {
855        if self.avail() < size_of::<T>() || reader.remain() < size_of::<T>() {
856            return Err(Error::InvalidArgs);
857        }
858
859        assert_eq!(self.cursor.cast_const(), reader.cursor);
860
861        let cursor = self.cursor.cast::<T>();
862        assert!(cursor.is_aligned());
863
864        // SAFETY:
865        // 1. The cursor is either valid for reading and writing or in user space for
866        //    `size_of::<T>()` bytes.
867        // 2. The cursor is aligned on an `align_of::<T>()`-byte boundary.
868        let cur_val = unsafe { T::atomic_cmpxchg_fallible(cursor, old_val, new_val)? };
869
870        Ok((cur_val, old_val == cur_val))
871    }
872
873    /// Writes `len` zeros to the target memory.
874    ///
875    /// This method attempts to fill up to `len` bytes with zeros. If the available
876    /// memory from the current cursor position is less than `len`, it will only fill
877    /// the available space.
878    ///
879    /// If the memory write failed due to an unresolvable page fault, this method
880    /// will return `Err` with the length set so far.
881    pub fn fill_zeros(&mut self, len: usize) -> Result<usize, (Error, usize)> {
882        let len_to_set = self.avail().min(len);
883        if len_to_set == 0 {
884            return Ok(0);
885        }
886
887        // SAFETY: The destination is a subset of the memory range specified by
888        // the current writer, so it is either valid for writing or in user space.
889        let set_len = unsafe { memset::<Fallible>(self.cursor, 0u8, len_to_set) };
890        self.cursor = self.cursor.wrapping_add(set_len);
891
892        if set_len < len_to_set {
893            Err((Error::PageFault, set_len))
894        } else {
895            Ok(len_to_set)
896        }
897    }
898}
899
900impl<Fallibility> VmWriter<'_, Fallibility> {
901    /// Returns the number of bytes for the available space.
902    pub fn avail(&self) -> usize {
903        self.end.addr() - self.cursor.addr()
904    }
905
906    /// Returns the cursor pointer, which refers to the address of the next byte to write.
907    pub fn cursor(&self) -> *mut u8 {
908        self.cursor
909    }
910
911    /// Returns if it has available space to write.
912    pub fn has_avail(&self) -> bool {
913        self.avail() > 0
914    }
915
916    /// Limits the length of available space.
917    ///
918    /// This method ensures the post condition of `self.avail() <= max_avail`.
919    pub fn limit(&mut self, max_avail: usize) -> &mut Self {
920        if max_avail < self.avail() {
921            self.end = self.cursor.wrapping_add(max_avail);
922        }
923
924        self
925    }
926
927    /// Skips the first `nbytes` bytes of data.
928    /// The length of available space is decreased accordingly.
929    ///
930    /// # Panics
931    ///
932    /// If `nbytes` is greater than `self.avail()`, then the method panics.
933    pub fn skip(&mut self, nbytes: usize) -> &mut Self {
934        assert!(nbytes <= self.avail());
935        self.cursor = self.cursor.wrapping_add(nbytes);
936
937        self
938    }
939
940    /// Creates a clone of this writer, requiring exclusive access.
941    ///
942    /// This method is analogous to [`Clone::clone`], but takes `&mut self`
943    /// instead of `&self`. The `&mut self` receiver is necessary because
944    /// `VmWriter` cannot safely implement `Clone`:
945    /// the underlying buffer may be a mutable slice,
946    /// and two concurrent writers would violate Rust's aliasing rules.
947    ///
948    /// The returned writer has the same cursor position and limit as `self`.
949    /// Because it borrows `self` mutably,
950    /// the original writer cannot be used until the returned writer is dropped.
951    ///
952    /// Note that writes through the returned writer
953    /// do **not** advance the cursor of the original writer.
954    pub fn clone_exclusive(&mut self) -> VmWriter<'_, Fallibility> {
955        VmWriter {
956            cursor: self.cursor,
957            end: self.end,
958            phantom: PhantomData,
959        }
960    }
961}
962
963impl<'a> From<&'a mut [u8]> for VmWriter<'a, Infallible> {
964    fn from(slice: &'a mut [u8]) -> Self {
965        // SAFETY:
966        // - The memory range points to typed memory.
967        // - The validity requirements for write accesses are met because the pointer is converted
968        //   from a mutable reference that outlives the lifetime `'a`.
969        // - The type, i.e., the `u8` slice, is plain-old-data.
970        unsafe { Self::from_kernel_space(slice.as_mut_ptr(), slice.len()) }
971    }
972}
973
/// A marker trait for POD types that can be read or written with one instruction.
///
/// This trait is mostly a hint: it is safe to implement and can be implemented for _any_ POD
/// type. If it is implemented for a type that cannot be read or written with a single
/// instruction, calling `read_once`/`write_once` with that type will fail a compile-time
/// assertion instead of silently producing a torn access.
pub trait PodOnce: Pod {}
980
981#[cfg(any(
982    target_arch = "x86_64",
983    target_arch = "riscv64",
984    target_arch = "loongarch64"
985))]
986mod pod_once_impls {
987    use super::PodOnce;
988
989    impl PodOnce for u8 {}
990    impl PodOnce for u16 {}
991    impl PodOnce for u32 {}
992    impl PodOnce for u64 {}
993    impl PodOnce for usize {}
994    impl PodOnce for i8 {}
995    impl PodOnce for i16 {}
996    impl PodOnce for i32 {}
997    impl PodOnce for i64 {}
998    impl PodOnce for isize {}
999
1000    /// Checks whether the memory operation created by `ptr::read_volatile` and
1001    /// `ptr::write_volatile` doesn't tear.
1002    ///
1003    /// Note that the Rust documentation makes no such guarantee, and even the wording in the LLVM
1004    /// LangRef is ambiguous. But this is unlikely to break in practice because the Linux kernel
1005    /// also uses "volatile" semantics to implement `READ_ONCE`/`WRITE_ONCE`.
1006    pub(super) const fn is_non_tearing<T>() -> bool {
1007        let size = size_of::<T>();
1008
1009        size == 1 || size == 2 || size == 4 || size == 8
1010    }
1011}
1012
/// A marker trait for POD types that can be read or written atomically.
///
/// Implementors supply the fallible atomic primitives below. Both are hidden from the
/// generated documentation and are `unsafe` to call.
pub trait PodAtomic: Pod {
    /// Atomically loads a value.
    /// This function will return errors if encountering an unresolvable page fault.
    ///
    /// Returns the loaded value.
    ///
    /// # Safety
    ///
    /// - `ptr` must either be [valid] for writes of `size_of::<Self>()` bytes or be in user
    ///   space for `size_of::<Self>()` bytes.
    /// - `ptr` must be aligned on an `align_of::<Self>()`-byte boundary.
    ///
    /// [valid]: crate::mm::io#safety
    // NOTE(review): this operation is a load, yet the contract above asks for validity for
    // writes. Confirm whether "valid for reads" is what is actually required.
    #[doc(hidden)]
    unsafe fn atomic_load_fallible(ptr: *const Self) -> Result<Self>;

    /// Atomically compares and exchanges a value.
    /// This function will return errors if encountering an unresolvable page fault.
    ///
    /// Returns the previous value.
    /// `new_val` will be written if and only if the previous value is equal to `old_val`.
    /// Callers can therefore detect success by comparing the returned value with `old_val`.
    ///
    /// # Safety
    ///
    /// - `ptr` must either be [valid] for writes of `size_of::<Self>()` bytes or be in user
    ///   space for `size_of::<Self>()` bytes.
    /// - `ptr` must be aligned on an `align_of::<Self>()`-byte boundary.
    ///
    /// [valid]: crate::mm::io#safety
    #[doc(hidden)]
    unsafe fn atomic_cmpxchg_fallible(ptr: *mut Self, old_val: Self, new_val: Self)
    -> Result<Self>;
}
1047
1048impl PodAtomic for u32 {
1049    unsafe fn atomic_load_fallible(ptr: *const Self) -> Result<Self> {
1050        // SAFETY: The safety is upheld by the caller.
1051        let result = unsafe { __atomic_load_fallible(ptr) };
1052        if result == !0 {
1053            Err(Error::PageFault)
1054        } else {
1055            Ok(result as Self)
1056        }
1057    }
1058
1059    unsafe fn atomic_cmpxchg_fallible(ptr: *mut Self, old_val: Self, new_val: Self) -> Result<u32> {
1060        // SAFETY: The safety is upheld by the caller.
1061        let result = unsafe { __atomic_cmpxchg_fallible(ptr, old_val, new_val) };
1062        if result == !0 {
1063            Err(Error::PageFault)
1064        } else {
1065            Ok(result as Self)
1066        }
1067    }
1068}