perf: Memory Donation Semantics with Rust Ownership Integration #96

@noahgift

Description

Summary

Implement memory donation semantics that allow operations to reuse input buffers for output when the input is no longer needed, mapping naturally to Rust's ownership model.

Inspiration: JAX's dispatch machinery (jax/_src/dispatch.py:322-530), which tracks buffer ownership with REUSE_INPUT, DONATE_INPUT, and ALWAYS_COPY modes.

Problem

Current tensor operations always allocate new output buffers:

let a = Tensor::<f32>::ones(&[1000, 1000]);  // 4 MB
let b = a.relu();  // allocates a new 4 MB buffer (a is still valid but never used again)
// a is dropped here; its allocation was wasted

This causes:

  • 2x peak memory for operations that could run in place
  • Allocation overhead on every operation
  • Cache pollution from unnecessary copies
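
Under the donation API proposed below (relu_donate is defined in section 3), the same chain peaks at a single buffer; a hypothetical sketch:

let a = Tensor::<f32>::ones(&[1000, 1000]);  // 4 MB
let b = a.relu_donate();  // consumes a; its 4 MB buffer is reused for b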

Proposed Solution

1. Ownership Tracking

use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::sync::Arc;

/// Reference count + ownership state
#[derive(Debug)]
pub struct OwnershipState {
    /// Number of active references
    ref_count: AtomicU32,
    /// Whether this buffer can be donated
    donatable: AtomicBool,
    /// Whether this buffer is exclusively owned
    exclusive: AtomicBool,
}

impl OwnershipState {
    pub fn new() -> Self {
        Self {
            ref_count: AtomicU32::new(1),
            donatable: AtomicBool::new(true),
            exclusive: AtomicBool::new(true),
        }
    }
    
    /// Check if buffer can be reused in-place
    pub fn can_reuse(&self) -> bool {
        self.ref_count.load(Ordering::Acquire) == 1 &&
        self.exclusive.load(Ordering::Acquire)
    }
    
    /// Check if buffer can be donated to another operation
    pub fn can_donate(&self) -> bool {
        self.can_reuse() && self.donatable.load(Ordering::Acquire)
    }
    
    /// Mark as shared (cannot reuse or donate)
    pub fn mark_shared(&self) {
        self.exclusive.store(false, Ordering::Release);
        self.donatable.store(false, Ordering::Release);
    }
    
    /// Increment reference count
    pub fn inc_ref(&self) {
        self.ref_count.fetch_add(1, Ordering::AcqRel);
        // Multiple references = not exclusive. Conservative: exclusivity is
        // not restored even if the count later drops back to 1.
        self.exclusive.store(false, Ordering::Release);
    }
    
    /// Decrement reference count, return true if should free
    pub fn dec_ref(&self) -> bool {
        self.ref_count.fetch_sub(1, Ordering::AcqRel) == 1
    }
}
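
A minimal test sketch of the intended transitions (hypothetical, but consistent with the semantics above):

#[cfg(test)]
mod ownership_tests {
    use super::*;

    #[test]
    fn transitions() {
        let state = OwnershipState::new();
        assert!(state.can_reuse() && state.can_donate());  // fresh: exclusive

        state.inc_ref();  // a second handle appears
        assert!(!state.can_reuse());  // shared: no in-place reuse

        assert!(!state.dec_ref());  // 2 -> 1: not the last reference yet
        assert!(!state.can_reuse());  // conservatively stays non-exclusive
        assert!(state.dec_ref());  // 1 -> 0: caller should free the buffer
    }
}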

/// Buffer with ownership tracking
pub struct OwnedBuffer {
    data: *mut u8,
    size: usize,
    state: Arc<OwnershipState>,
    /// Original allocator for deallocation
    allocator: AllocatorId,
}

impl OwnedBuffer {
    /// Try to get exclusive mutable access for in-place operation
    pub fn try_get_mut(&mut self) -> Option<&mut [u8]> {
        if self.state.can_reuse() {
            // SAFETY: We have exclusive access
            Some(unsafe { std::slice::from_raw_parts_mut(self.data, self.size) })
        } else {
            None
        }
    }
    
    /// Donate this buffer to a new tensor (transfers ownership)
    pub fn donate(self) -> Option<OwnedBuffer> {
        if self.state.can_donate() {
            Some(self)
        } else {
            None
        }
    }
}
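
For completeness, a sketch of how dec_ref would drive deallocation on drop; dealloc_raw is a hypothetical hook into the allocator registry, not part of this proposal:

impl Drop for OwnedBuffer {
    fn drop(&mut self) {
        // Free only when the last reference goes away
        if self.state.dec_ref() && !self.data.is_null() {
            // SAFETY: we are the last owner; `data` came from `allocator`.
            // dealloc_raw is a hypothetical allocator-registry hook.
            unsafe { dealloc_raw(self.data, self.size, self.allocator) };
        }
    }
}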

2. Copy Semantics Enum

/// How to handle input buffer during operation
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CopySemantics {
    /// Reuse input buffer if exclusively owned (in-place mutation)
    ReuseInput,
    /// Take ownership of input buffer (donate to output)
    DonateInput,
    /// Always allocate new buffer (safe default)
    AlwaysCopy,
}

impl Default for CopySemantics {
    fn default() -> Self {
        // Default to safe behavior
        CopySemantics::AlwaysCopy
    }
}

/// Builder for operation configuration. Takes the input by value so the
/// ReuseInput/DonateInput paths can legally mutate or consume its buffer.
pub struct OpBuilder<T> {
    input: Tensor<T>,
    copy_semantics: CopySemantics,
}

impl<T: TensorElement> OpBuilder<T> {
    pub fn new(input: Tensor<T>) -> Self {
        Self {
            input,
            copy_semantics: CopySemantics::AlwaysCopy,
        }
    }
    
    /// Enable in-place mutation if input is exclusively owned
    pub fn inplace_if_possible(mut self) -> Self {
        if self.input.is_exclusively_owned() {
            self.copy_semantics = CopySemantics::ReuseInput;
        }
        self
    }
    
    /// Donate input buffer to output (input becomes invalid)
    pub fn donate(mut self) -> Self {
        self.copy_semantics = CopySemantics::DonateInput;
        self
    }
    
    /// Execute unary operation with configured semantics
    pub fn apply<F>(self, op: F) -> Tensor<T>
    where
        F: FnOnce(&mut [T]),
    {
        match self.copy_semantics {
            CopySemantics::ReuseInput => {
                // Mutate the owned input in place and hand it back
                let mut input = self.input;
                op(input.data_mut().expect("not exclusively owned"));
                input
            }
            CopySemantics::DonateInput => {
                // Move the buffer out of the input and rewrap it
                let mut input = self.input;
                let shape = input.shape().to_vec();
                let mut buffer = input.take_buffer().expect("cannot donate");
                op(buffer.as_mut_slice());  // typed view over the raw bytes
                Tensor::from_buffer(buffer, shape)
            }
            CopySemantics::AlwaysCopy => {
                // Allocate a fresh buffer, copy the input, then mutate the copy
                let mut output = Tensor::zeros(self.input.shape());
                let data = output.data_mut().expect("fresh buffer is exclusive");
                data.copy_from_slice(self.input.as_slice());
                op(data);
                output
            }
        }
    }
}
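
Call-site sketch (assumes a Tensor<f32> input; tanh stands in for any elementwise kernel):

let x = Tensor::<f32>::ones(&[1024]);
let y = OpBuilder::new(x)  // consumes x
    .inplace_if_possible()  // reuse x's buffer if exclusively owned
    .apply(|data| {
        for v in data.iter_mut() {
            *v = v.tanh();
        }
    });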

3. Tensor API Integration

impl<T: TensorElement> Tensor<T> {
    /// Check if this tensor exclusively owns its buffer
    pub fn is_exclusively_owned(&self) -> bool {
        self.buffer.state.can_reuse()
    }
    
    /// Get mutable data if exclusively owned
    pub fn data_mut(&mut self) -> Option<&mut [T]> {
        self.buffer.try_get_mut().map(|bytes| {
            // SAFETY: Buffer is properly aligned for T
            unsafe {
                std::slice::from_raw_parts_mut(
                    bytes.as_mut_ptr() as *mut T,
                    bytes.len() / std::mem::size_of::<T>(),
                )
            }
        })
    }
    
    /// Take ownership of buffer (self becomes empty)
    pub fn take_buffer(&mut self) -> Option<OwnedBuffer> {
        if self.buffer.state.can_donate() {
            Some(std::mem::replace(&mut self.buffer, OwnedBuffer::empty()))
        } else {
            None
        }
    }
    
    /// ReLU with a safe default (clones, then reuses the clone's buffer when possible)
    pub fn relu(&self) -> Tensor<T> {
        OpBuilder::new(self.clone())
            .inplace_if_possible()
            .apply(|data| {
                for x in data.iter_mut() {
                    if *x < T::zero() {
                        *x = T::zero();
                    }
                }
            })
    }
    
    /// ReLU that always mutates in-place (consumes self)
    pub fn relu_inplace(mut self) -> Tensor<T> {
        if let Some(data) = self.data_mut() {
            for x in data.iter_mut() {
                if *x < T::zero() {
                    *x = T::zero();
                }
            }
            self
        } else {
            // Fallback: copy then mutate
            self.relu()
        }
    }
    
    /// Explicit donate API
    pub fn relu_donate(self) -> Tensor<T> {
        OpBuilder::new(self)
            .donate()
            .apply(|data| {
                for x in data.iter_mut() {
                    if *x < T::zero() {
                        *x = T::zero();
                    }
                }
            })
    }
}
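
The three entry points compared (sketch):

let a = Tensor::<f32>::ones(&[1024]);
let b = a.relu();          // safe default: fresh buffer, a still usable
let c = b.relu_inplace();  // consumes b; mutates its buffer when exclusive
let d = c.relu_donate();   // consumes c; its buffer moves into d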

4. Automatic Donation in Operations

/// Trait for operations that can donate input
pub trait Donatable<T> {
    /// Execute with automatic input donation if beneficial
    fn execute_with_donation(self, input: Tensor<T>) -> Tensor<T>;
}

/// Binary operation with donation support
pub fn binary_op_with_donation<T: TensorElement, F>(
    mut a: Tensor<T>,
    b: &Tensor<T>,
    op: F,
) -> Tensor<T>
where
    F: Fn(T, T) -> T,
{
    // Try to reuse a's buffer if shapes match and a is donatable
    if a.shape() == b.shape() && a.is_exclusively_owned() {
        // In-place on a
        let a_data = a.data_mut().unwrap();
        let b_data = b.as_slice();
        
        for (x, &y) in a_data.iter_mut().zip(b_data.iter()) {
            *x = op(*x, y);
        }
        
        return a;
    }
    
    // A symmetric path could reuse b's buffer when op is commutative and
    // b is passed by value; elided here
    
    // Fallback: allocate a new output (shapes are assumed equal here;
    // broadcasting is out of scope for this sketch)
    let mut output = Tensor::zeros(a.shape());
    let out_data = output.data_mut().unwrap();
    let a_data = a.as_slice();
    let b_data = b.as_slice();
    
    for ((o, &x), &y) in out_data.iter_mut().zip(a_data.iter()).zip(b_data.iter()) {
        *o = op(x, y);
    }
    
    output
}

// Add with donation
impl<T: TensorElement> std::ops::Add for Tensor<T> {
    type Output = Tensor<T>;
    
    fn add(self, rhs: Tensor<T>) -> Tensor<T> {
        binary_op_with_donation(self, &rhs, |a, b| a + b)
    }
}

// Add-assign (always in-place if possible)
impl<T: TensorElement> std::ops::AddAssign<&Tensor<T>> for Tensor<T> {
    fn add_assign(&mut self, rhs: &Tensor<T>) {
        if let Some(data) = self.data_mut() {
            let rhs_data = rhs.as_slice();
            for (x, &y) in data.iter_mut().zip(rhs_data.iter()) {
                *x = *x + y;
            }
        } else {
            // Buffer is shared: fall back to an out-of-place add
            *self = self.clone() + rhs.clone();
        }
    }
}
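
Usage sketch: by-value + donates the left operand's buffer, while += accumulates in place when the target is exclusively owned:

let a = Tensor::<f32>::ones(&[256]);
let b = Tensor::<f32>::ones(&[256]);
let mut c = a + b;  // a's buffer is reused for c (a was exclusive)

let d = Tensor::<f32>::ones(&[256]);
c += &d;  // in-place accumulate; falls back to a copy if c were shared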

5. Compile-Time Donation Hints

/// Marker trait for types that can be donated
pub trait CanDonate {}

/// Wrapper that indicates intention to donate
#[repr(transparent)]
pub struct Donated<T>(pub T);

impl<T> CanDonate for Donated<T> {}

/// Extension trait for donation
pub trait DonateExt: Sized {
    fn donate(self) -> Donated<Self> {
        Donated(self)
    }
}

impl<T: TensorElement> DonateExt for Tensor<T> {}

// Operations can specialize on Donated<Tensor>
impl<T: TensorElement> std::ops::Add<Tensor<T>> for Donated<Tensor<T>> {
    type Output = Tensor<T>;
    
    fn add(self, rhs: Tensor<T>) -> Tensor<T> {
        // Guaranteed to try donation
        binary_op_with_donation(self.0, &rhs, |a, b| a + b)
    }
}

// Usage:
// let result = a.donate() + b;  // Will reuse a's buffer

6. Arena Allocator for Temporary Chains

use std::cell::{Cell, RefCell};
use std::marker::PhantomData;

/// Arena allocator for temporary tensors in a computation
pub struct TensorArena {
    /// Buffers owned by the arena for its whole lifetime
    buffers: RefCell<Vec<OwnedBuffer>>,
    /// Free list by size class (class i holds buffers of 2^i bytes, i < 32)
    free_lists: RefCell<[Vec<OwnedBuffer>; 32]>,
    /// Total allocated bytes
    total_bytes: Cell<usize>,
    /// Maximum bytes allowed
    max_bytes: usize,
}

impl TensorArena {
    pub fn new(max_bytes: usize) -> Self {
        Self {
            buffers: RefCell::new(Vec::new()),
            free_lists: RefCell::new(std::array::from_fn(|_| Vec::new())),
            total_bytes: Cell::new(0),
            max_bytes,
        }
    }
    
    /// Allocate a tensor from the arena
    pub fn alloc<T: TensorElement>(&self, shape: &[usize]) -> ArenaTensor<T> {
        let size = shape.iter().product::<usize>() * std::mem::size_of::<T>();
        let size_class = size.next_power_of_two().trailing_zeros() as usize;
        assert!(size_class < 32, "allocation exceeds largest size class");
        assert!(
            self.total_bytes.get() + size <= self.max_bytes,
            "arena budget exceeded"
        );
        
        // Try free list first
        let buffer = {
            let mut free_lists = self.free_lists.borrow_mut();
            if let Some(buf) = free_lists[size_class].pop() {
                buf
            } else {
                // Allocate new
                OwnedBuffer::allocate(size)
            }
        };
        
        self.total_bytes.set(self.total_bytes.get() + size);
        
        ArenaTensor {
            buffer,
            shape: shape.to_vec(),
            arena: self,
            _marker: PhantomData,
        }
    }
    
    /// Return a buffer to the arena
    fn return_buffer(&self, buffer: OwnedBuffer) {
        let size_class = buffer.size.next_power_of_two().trailing_zeros() as usize;
        self.total_bytes.set(self.total_bytes.get().saturating_sub(buffer.size));
        self.free_lists.borrow_mut()[size_class].push(buffer);
    }
    
    /// Clear all allocations (call at end of computation)
    pub fn clear(&self) {
        self.buffers.borrow_mut().clear();
        for list in self.free_lists.borrow_mut().iter_mut() {
            list.clear();
        }
        self.total_bytes.set(0);
    }
}

/// Tensor allocated from an arena
pub struct ArenaTensor<'a, T> {
    buffer: OwnedBuffer,
    shape: Vec<usize>,
    arena: &'a TensorArena,
    _marker: PhantomData<T>,
}

impl<'a, T> Drop for ArenaTensor<'a, T> {
    fn drop(&mut self) {
        // Return buffer to arena instead of freeing
        self.arena.return_buffer(std::mem::replace(
            &mut self.buffer,
            OwnedBuffer::empty(),
        ));
    }
}

// Usage in forward pass:
// let arena = TensorArena::new(1 << 30);  // 1GB
// let x = arena.alloc::<f32>(&[batch, seq, hidden]);
// let y = arena.alloc::<f32>(&[batch, seq, hidden]);
// // Computation reuses arena memory
// arena.clear();
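
The size-class mapping used by alloc and return_buffer, factored out for clarity (same expression as above):

/// Round up to the next power of two and index by the exponent.
fn size_class(bytes: usize) -> usize {
    bytes.next_power_of_two().trailing_zeros() as usize
}

#[test]
fn size_class_rounds_up() {
    assert_eq!(size_class(4096), 12);  // exactly 4 KiB -> class 12
    assert_eq!(size_class(5000), 13);  // rounds up to 8 KiB
}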

Acceptance Criteria

  • OwnershipState with atomic ref counting
  • CopySemantics enum with REUSE/DONATE/COPY modes
  • OpBuilder for configuring operation semantics
  • Tensor::data_mut() for exclusive mutable access
  • Donated<T> wrapper for explicit donation
  • Binary operations with automatic donation
  • TensorArena for computation chains
  • Benchmarks showing memory savings
  • Unit tests for ownership correctness

Expected Performance Impact

Operation             AlwaysCopy   With Donation   Memory Saved
ReLU chain (10 ops)   40 MB        4 MB            90%
ResNet block          200 MB       80 MB           60%
Transformer layer     500 MB       200 MB          60%
Full forward pass     2 GB         800 MB          60%

Expected overall: 30-60% peak-memory reduction, enabling larger batch sizes or longer sequences.
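
Sketch of the measurement behind the ReLU-chain row: ten chained ReLUs over a 1M-element f32 tensor (4 MB); with donation the chain should peak at one buffer instead of ten:

fn relu_chain_donating(mut t: Tensor<f32>, steps: usize) -> Tensor<f32> {
    for _ in 0..steps {
        t = t.relu_donate();  // each step reuses the same 4 MB buffer
    }
    t
}

// Baseline for comparison: t = t.relu() allocates a fresh buffer per step.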

Labels

performance, memory, ownership, P1-high
