Summary
Implement memory donation semantics that allow operations to reuse input buffers for output when the input is no longer needed, mapping naturally to Rust's ownership model.
Inspiration: JAX (jax/_src/dispatch.py:322-530), which tracks ownership with REUSE_INPUT, DONATE_INPUT, and ALWAYS_COPY modes.
Problem
Current tensor operations always allocate new output buffers:
let a = Tensor::ones(&[1000, 1000]); // 4MB
let b = a.relu(); // Allocates new 4MB (a still valid but unused)
// a is dropped here, wasting the allocation
This causes:
- 2x memory usage for in-place-able operations
- Allocation overhead on every operation
- Cache pollution from unnecessary copies
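By contrast, the donation semantics proposed below would let the last use of a tensor hand its buffer to the output. A rough sketch of the intended usage, relying on the relu_inplace method proposed later in this issue (illustrative, not a final API):
let a = Tensor::ones(&[1000, 1000]); // 4MB
// `a` is consumed here, so its 4MB buffer can be reused for the output
let b = a.relu_inplace(); // no second 4MB allocation when `a` is exclusively owned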
Proposed Solution
1. Ownership Tracking
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::sync::Arc;
/// Reference count + ownership state
#[derive(Debug)]
pub struct OwnershipState {
/// Number of active references
ref_count: AtomicU32,
/// Whether this buffer can be donated
donatable: AtomicBool,
/// Whether this buffer is exclusively owned
exclusive: AtomicBool,
}
impl OwnershipState {
pub fn new() -> Self {
Self {
ref_count: AtomicU32::new(1),
donatable: AtomicBool::new(true),
exclusive: AtomicBool::new(true),
}
}
/// Check if buffer can be reused in-place
pub fn can_reuse(&self) -> bool {
self.ref_count.load(Ordering::Acquire) == 1 &&
self.exclusive.load(Ordering::Acquire)
}
/// Check if buffer can be donated to another operation
pub fn can_donate(&self) -> bool {
self.can_reuse() && self.donatable.load(Ordering::Acquire)
}
/// Mark as shared (cannot reuse or donate)
pub fn mark_shared(&self) {
self.exclusive.store(false, Ordering::Release);
self.donatable.store(false, Ordering::Release);
}
/// Increment reference count
pub fn inc_ref(&self) {
self.ref_count.fetch_add(1, Ordering::AcqRel);
// Multiple references = not exclusive
self.exclusive.store(false, Ordering::Release);
}
/// Decrement reference count, return true if should free
pub fn dec_ref(&self) -> bool {
self.ref_count.fetch_sub(1, Ordering::AcqRel) == 1
}
}
/// Buffer with ownership tracking
pub struct OwnedBuffer {
data: *mut u8,
size: usize,
state: Arc<OwnershipState>,
/// Original allocator for deallocation
allocator: AllocatorId,
}
impl OwnedBuffer {
/// Try to get exclusive mutable access for in-place operation
pub fn try_get_mut(&mut self) -> Option<&mut [u8]> {
if self.state.can_reuse() {
// SAFETY: We have exclusive access
Some(unsafe { std::slice::from_raw_parts_mut(self.data, self.size) })
} else {
None
}
}
/// Donate this buffer to a new tensor (transfers ownership)
pub fn donate(self) -> Option<OwnedBuffer> {
if self.state.can_donate() {
Some(self)
} else {
None
}
}
}
2. Copy Semantics Enum
/// How to handle input buffer during operation
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CopySemantics {
/// Reuse input buffer if exclusively owned (in-place mutation)
ReuseInput,
/// Take ownership of input buffer (donate to output)
DonateInput,
/// Always allocate new buffer (safe default)
AlwaysCopy,
}
impl Default for CopySemantics {
fn default() -> Self {
// Default to safe behavior
CopySemantics::AlwaysCopy
}
}
/// Builder for operation configuration
pub struct OpBuilder<'a, T> {
input: &'a Tensor<T>,
copy_semantics: CopySemantics,
}
impl<'a, T: TensorElement> OpBuilder<'a, T> {
pub fn new(input: &'a Tensor<T>) -> Self {
Self {
input,
copy_semantics: CopySemantics::AlwaysCopy,
}
}
/// Enable in-place mutation if input is exclusively owned
pub fn inplace_if_possible(mut self) -> Self {
if self.input.is_exclusively_owned() {
self.copy_semantics = CopySemantics::ReuseInput;
}
self
}
/// Donate input buffer to output (input becomes invalid)
pub fn donate(mut self) -> Self {
self.copy_semantics = CopySemantics::DonateInput;
self
}
/// Execute unary operation with configured semantics
pub fn apply<F>(self, op: F) -> Tensor<T>
where
F: FnOnce(&mut [T]),
{
match self.copy_semantics {
CopySemantics::ReuseInput => {
// Get mutable access and modify in place
let mut data = self.input.data_mut().expect("Not exclusively owned");
op(&mut data);
self.input.clone() // Return same tensor
}
CopySemantics::DonateInput => {
// Take the buffer and modify
let buffer = self.input.take_buffer().expect("Cannot donate");
let mut data = buffer.as_mut_slice();
op(&mut data);
Tensor::from_buffer(buffer, self.input.shape().to_vec())
}
CopySemantics::AlwaysCopy => {
// Allocate new buffer, copy, modify
let mut output = self.input.clone();
let mut data = output.data_mut().unwrap();
op(&mut data);
output
}
}
}
}
3. Tensor API Integration
impl<T: TensorElement> Tensor<T> {
/// Check if this tensor exclusively owns its buffer
pub fn is_exclusively_owned(&self) -> bool {
self.buffer.state.can_reuse()
}
/// Get mutable data if exclusively owned
pub fn data_mut(&mut self) -> Option<&mut [T]> {
self.buffer.try_get_mut().map(|bytes| {
// SAFETY: Buffer is properly aligned for T
unsafe {
std::slice::from_raw_parts_mut(
bytes.as_mut_ptr() as *mut T,
bytes.len() / std::mem::size_of::<T>(),
)
}
})
}
/// Take ownership of buffer (self becomes empty)
pub fn take_buffer(&mut self) -> Option<OwnedBuffer> {
if self.buffer.state.can_donate() {
Some(std::mem::replace(&mut self.buffer, OwnedBuffer::empty()))
} else {
None
}
}
/// ReLU with optional in-place execution
pub fn relu(&self) -> Tensor<T> {
OpBuilder::new(self)
.inplace_if_possible()
.apply(|data| {
for x in data.iter_mut() {
if *x < T::zero() {
*x = T::zero();
}
}
})
}
/// ReLU that always mutates in-place (consumes self)
pub fn relu_inplace(mut self) -> Tensor<T> {
if let Some(data) = self.data_mut() {
for x in data.iter_mut() {
if *x < T::zero() {
*x = T::zero();
}
}
self
} else {
// Fallback: copy then mutate
self.relu()
}
}
/// Explicit donate API
pub fn relu_donate(self) -> Tensor<T> {
OpBuilder::new(&self)
.donate()
.apply(|data| {
for x in data.iter_mut() {
if *x < T::zero() {
*x = T::zero();
}
}
})
}
}
4. Automatic Donation in Operations
/// Trait for operations that can donate input
pub trait Donatable<T> {
/// Execute with automatic input donation if beneficial
fn execute_with_donation(self, input: Tensor<T>) -> Tensor<T>;
}
/// Binary operation with donation support
pub fn binary_op_with_donation<T: TensorElement, F>(
mut a: Tensor<T>,
b: &Tensor<T>,
op: F,
) -> Tensor<T>
where
F: Fn(T, T) -> T,
{
// Try to reuse a's buffer if shapes match and a is donatable
if a.shape() == b.shape() && a.is_exclusively_owned() {
// In-place on a
let a_data = a.data_mut().unwrap();
let b_data = b.as_slice();
for (x, &y) in a_data.iter_mut().zip(b_data.iter()) {
*x = op(*x, y);
}
return a;
}
// Try to reuse b's buffer (if op is commutative and b is donatable)
// ... similar logic
// Fallback: allocate new
let mut output = Tensor::zeros(a.shape());
let out_data = output.data_mut().unwrap();
let a_data = a.as_slice();
let b_data = b.as_slice();
for ((o, &x), &y) in out_data.iter_mut().zip(a_data.iter()).zip(b_data.iter()) {
*o = op(x, y);
}
output
}
// Add with donation
impl<T: TensorElement> std::ops::Add for Tensor<T> {
type Output = Tensor<T>;
fn add(self, rhs: Tensor<T>) -> Tensor<T> {
binary_op_with_donation(self, &rhs, |a, b| a + b)
}
}
// Add-assign (always in-place if possible)
impl<T: TensorElement> std::ops::AddAssign<&Tensor<T>> for Tensor<T> {
fn add_assign(&mut self, rhs: &Tensor<T>) {
if let Some(data) = self.data_mut() {
let rhs_data = rhs.as_slice();
for (x, &y) in data.iter_mut().zip(rhs_data.iter()) {
*x = *x + y;
}
} else {
*self = self.clone() + rhs.clone();
}
}
}
5. Compile-Time Donation Hints
/// Marker trait for types that can be donated
pub trait CanDonate {}
/// Wrapper that indicates intention to donate
#[repr(transparent)]
pub struct Donated<T>(pub T);
impl<T> CanDonate for Donated<T> {}
/// Extension trait for donation
pub trait DonateExt: Sized {
fn donate(self) -> Donated<Self> {
Donated(self)
}
}
impl<T: TensorElement> DonateExt for Tensor<T> {}
// Operations can specialize on Donated<Tensor>
impl<T: TensorElement> std::ops::Add<Tensor<T>> for Donated<Tensor<T>> {
type Output = Tensor<T>;
fn add(self, rhs: Tensor<T>) -> Tensor<T> {
// Guaranteed to try donation
binary_op_with_donation(self.0, &rhs, |a, b| a + b)
}
}
// Usage:
// let result = a.donate() + b; // Will reuse a's buffer
6. Arena Allocator for Temporary Chains
use std::cell::{Cell, RefCell};
use std::marker::PhantomData;
/// Arena allocator for temporary tensors in a computation
pub struct TensorArena {
/// Allocated buffers
buffers: RefCell<Vec<OwnedBuffer>>,
/// Free list by size class
free_lists: RefCell<[Vec<OwnedBuffer>; 32]>, // Size classes: 2^5 to 2^36
/// Total allocated bytes
total_bytes: Cell<usize>,
/// Maximum bytes allowed
max_bytes: usize,
}
impl TensorArena {
pub fn new(max_bytes: usize) -> Self {
Self {
buffers: RefCell::new(Vec::new()),
free_lists: RefCell::new(std::array::from_fn(|_| Vec::new())),
total_bytes: Cell::new(0),
max_bytes,
}
}
/// Allocate a tensor from the arena
pub fn alloc<T: TensorElement>(&self, shape: &[usize]) -> ArenaTensor<T> {
let size = shape.iter().product::<usize>() * std::mem::size_of::<T>();
// Size classes start at 2^5 = 32 bytes; index 0 covers sizes <= 32
let size_class = size.max(32).next_power_of_two().trailing_zeros() as usize - 5;
// Try free list first
let buffer = {
let mut free_lists = self.free_lists.borrow_mut();
if let Some(buf) = free_lists[size_class].pop() {
buf
} else {
// Allocate new
OwnedBuffer::allocate(size)
}
};
self.total_bytes.set(self.total_bytes.get() + size);
ArenaTensor {
buffer,
shape: shape.to_vec(),
arena: self,
_marker: PhantomData,
}
}
/// Return a buffer to the arena
fn return_buffer(&self, buffer: OwnedBuffer) {
let size_class = buffer.size.max(32).next_power_of_two().trailing_zeros() as usize - 5;
self.total_bytes.set(self.total_bytes.get().saturating_sub(buffer.size));
self.free_lists.borrow_mut()[size_class].push(buffer);
}
/// Clear all allocations (call at end of computation)
pub fn clear(&self) {
self.buffers.borrow_mut().clear();
for list in self.free_lists.borrow_mut().iter_mut() {
list.clear();
}
self.total_bytes.set(0);
}
}
/// Tensor allocated from an arena
pub struct ArenaTensor<'a, T> {
buffer: OwnedBuffer,
shape: Vec<usize>,
arena: &'a TensorArena,
_marker: PhantomData<T>,
}
impl<'a, T> Drop for ArenaTensor<'a, T> {
fn drop(&mut self) {
// Return buffer to arena instead of freeing
self.arena.return_buffer(std::mem::replace(
&mut self.buffer,
OwnedBuffer::empty(),
));
}
}
// Usage in forward pass:
// let arena = TensorArena::new(1 << 30); // 1GB
// let x = arena.alloc::<f32>(&[batch, seq, hidden]);
// let y = arena.alloc::<f32>(&[batch, seq, hidden]);
// // Computation reuses arena memory
// arena.clear();
Acceptance Criteria
- OwnershipState with atomic ref counting
- CopySemantics enum with REUSE/DONATE/COPY modes
- OpBuilder for configuring operation semantics
- Tensor::data_mut() for exclusive mutable access
- Donated<T> wrapper for explicit donation
- Binary operations with automatic donation
- TensorArena for computation chains
- Benchmarks showing memory savings
- Unit tests for ownership correctness (see the sketch after this list)
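A minimal sketch of an ownership-correctness test, written against the OwnershipState API proposed above (illustrative only):
#[cfg(test)]
mod ownership_tests {
    use super::*;
    #[test]
    fn fresh_buffer_is_reusable_and_donatable() {
        let state = OwnershipState::new();
        assert!(state.can_reuse());
        assert!(state.can_donate());
    }
    #[test]
    fn shared_buffer_cannot_be_reused() {
        let state = OwnershipState::new();
        state.inc_ref(); // a second reference now exists
        assert!(!state.can_reuse());
        assert!(!state.can_donate());
        // Dropping the extra reference decrements the count,
        // but exclusivity is not regained in this design.
        assert!(!state.dec_ref());
    }
}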
Expected Performance Impact
| Operation | AlwaysCopy | With Donation | Memory Saved |
|---|---|---|---|
| ReLU chain (10 ops) | 40MB | 4MB | 90% |
| ResNet block | 200MB | 80MB | 60% |
| Transformer layer | 500MB | 200MB | 60% |
| Full forward pass | 2GB | 800MB | 60% |
Overall: a 30-60% memory reduction, enabling larger batch sizes or longer sequences.
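For the ReLU-chain row, the arithmetic is: ten ops over a 1000x1000 f32 tensor (4 MB each) allocate roughly 40 MB under the current always-copy behavior, versus one reused 4 MB buffer with donation. A rough sketch of that benchmark scenario using the proposed API (illustrative only):
// Baseline (current behavior): each relu() allocates a fresh 4 MB output,
// so a 10-op chain touches ~40 MB of allocations.
let mut x = Tensor::ones(&[1000, 1000]); // f32, 4 MB
for _ in 0..10 {
    x = x.relu();
}
// With donation: relu_inplace() consumes its input and reuses the same
// 4 MB buffer at every step, so peak usage stays around 4 MB.
let mut y = Tensor::ones(&[1000, 1000]);
for _ in 0..10 {
    y = y.relu_inplace();
}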
References
- JAX donation semantics: jax/_src/dispatch.py:322-530
- Rust ownership: https://doc.rust-lang.org/book/ch04-00-understanding-ownership.html
- XLA buffer donation: https://www.tensorflow.org/xla/operation_semantics#buffer_donation
Labels
performance, memory, ownership, P1-high