Skip to content

Commit 0d94525

Browse files
alambJefffrey
authored andcommitted
Improve Buffer documentation, deprecate Buffer::from_bytes add From<Bytes> and From<bytes::Bytes> impls (apache#6939)
* Improve Bytes documentation * Improve Buffer documentation, add From<Bytes> and From<bytes::Bytes> impls * avoid linking to private docs * Deprecate `Buffer::from_bytes` * Apply suggestions from code review Co-authored-by: Jeffrey Vo <jeffrey.vo.australia@gmail.com> --------- Co-authored-by: Jeffrey Vo <jeffrey.vo.australia@gmail.com>
1 parent b4313b4 commit 0d94525

6 files changed

Lines changed: 104 additions & 38 deletions

File tree

arrow-buffer/src/buffer/immutable.rs

Lines changed: 91 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,43 @@ use crate::{bit_util, bytes::Bytes, native::ArrowNativeType};
2828
use super::ops::bitwise_unary_op_helper;
2929
use super::{MutableBuffer, ScalarBuffer};
3030

31-
/// Buffer represents a contiguous memory region that can be shared with other buffers and across
32-
/// thread boundaries.
31+
/// A contiguous memory region that can be shared with other buffers and across
32+
/// thread boundaries that stores Arrow data.
33+
///
34+
/// `Buffer`s can be sliced and cloned without copying the underlying data and can
35+
/// be created from memory allocated by non-Rust sources such as C/C++.
36+
///
37+
/// # Example: Create a `Buffer` from a `Vec` (without copying)
38+
/// ```
39+
/// # use arrow_buffer::Buffer;
40+
/// let vec: Vec<u32> = vec![1, 2, 3];
41+
/// let buffer = Buffer::from(vec);
42+
/// ```
43+
///
44+
/// # Example: Convert a `Buffer` to a `Vec` (without copying)
45+
///
46+
/// Use [`Self::into_vec`] to convert a `Buffer` back into a `Vec` if there are
47+
/// no other references and the types are aligned correctly.
48+
/// ```
49+
/// # use arrow_buffer::Buffer;
50+
/// # let vec: Vec<u32> = vec![1, 2, 3];
51+
/// # let buffer = Buffer::from(vec);
52+
/// // convert the buffer back into a Vec of u32
53+
/// // note this will fail if the buffer is shared or not aligned correctly
54+
/// let vec: Vec<u32> = buffer.into_vec().unwrap();
55+
/// ```
56+
///
57+
/// # Example: Create a `Buffer` from a [`bytes::Bytes`] (without copying)
58+
///
59+
/// [`bytes::Bytes`] is a common type in the Rust ecosystem for shared memory
60+
/// regions. You can create a buffer from a `Bytes` instance using the `From`
61+
/// implementation, also without copying.
62+
///
63+
/// ```
64+
/// # use arrow_buffer::Buffer;
65+
/// let bytes = bytes::Bytes::from("hello");
66+
/// let buffer = Buffer::from(bytes);
67+
///```
3368
#[derive(Clone, Debug)]
3469
pub struct Buffer {
3570
/// the internal byte buffer.
@@ -59,24 +94,15 @@ unsafe impl Send for Buffer where Bytes: Send {}
5994
unsafe impl Sync for Buffer where Bytes: Sync {}
6095

6196
impl Buffer {
62-
/// Auxiliary method to create a new Buffer
97+
/// Create a new Buffer from a (internal) `Bytes`
6398
///
64-
/// This can be used with a [`bytes::Bytes`] via `into()`:
99+
/// NOTE despite the same name, `Bytes` is an internal struct in arrow-rs
100+
/// and is different than [`bytes::Bytes`].
65101
///
66-
/// ```
67-
/// # use arrow_buffer::Buffer;
68-
/// let bytes = bytes::Bytes::from_static(b"foo");
69-
/// let buffer = Buffer::from_bytes(bytes.into());
70-
/// ```
71-
#[inline]
102+
/// See examples on [`Buffer`] for ways to create a buffer from a [`bytes::Bytes`].
103+
#[deprecated(since = "54.1.0", note = "Use Buffer::from instead")]
72104
pub fn from_bytes(bytes: Bytes) -> Self {
73-
let length = bytes.len();
74-
let ptr = bytes.as_ptr();
75-
Buffer {
76-
data: Arc::new(bytes),
77-
ptr,
78-
length,
79-
}
105+
Self::from(bytes)
80106
}
81107

82108
/// Returns the offset, in bytes, of `Self::ptr` to `Self::data`
@@ -107,8 +133,11 @@ impl Buffer {
107133
buffer.into()
108134
}
109135

110-
/// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting
111-
/// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero.
136+
/// Creates a buffer from an existing memory region.
137+
///
138+
/// Ownership of the memory is tracked via reference counting
139+
/// and the memory will be freed using the `drop` method of
140+
/// [crate::alloc::Allocation] when the reference count reaches zero.
112141
///
113142
/// # Arguments
114143
///
@@ -155,7 +184,7 @@ impl Buffer {
155184
self.data.capacity()
156185
}
157186

158-
/// Tried to shrink the capacity of the buffer as much as possible, freeing unused memory.
187+
/// Tries to shrink the capacity of the buffer as much as possible, freeing unused memory.
159188
///
160189
/// If the buffer is shared, this is a no-op.
161190
///
@@ -190,7 +219,7 @@ impl Buffer {
190219
}
191220
}
192221

193-
/// Returns whether the buffer is empty.
222+
/// Returns true if the buffer is empty.
194223
#[inline]
195224
pub fn is_empty(&self) -> bool {
196225
self.length == 0
@@ -206,7 +235,9 @@ impl Buffer {
206235
}
207236

208237
/// Returns a new [Buffer] that is a slice of this buffer starting at `offset`.
209-
/// Doing so allows the same memory region to be shared between buffers.
238+
///
239+
/// This function is `O(1)` and does not copy any data, allowing the
240+
/// same memory region to be shared between buffers.
210241
///
211242
/// # Panics
212243
///
@@ -240,7 +271,10 @@ impl Buffer {
240271

241272
/// Returns a new [Buffer] that is a slice of this buffer starting at `offset`,
242273
/// with `length` bytes.
243-
/// Doing so allows the same memory region to be shared between buffers.
274+
///
275+
/// This function is `O(1)` and does not copy any data, allowing the same
276+
/// memory region to be shared between buffers.
277+
///
244278
/// # Panics
245279
/// Panics iff `(offset + length)` is larger than the existing length.
246280
pub fn slice_with_length(&self, offset: usize, length: usize) -> Self {
@@ -328,10 +362,16 @@ impl Buffer {
328362
})
329363
}
330364

331-
/// Returns `Vec` for mutating the buffer
365+
/// Converts self into a `Vec`, if possible.
366+
///
367+
/// This can be used to reuse / mutate the underlying data.
332368
///
333-
/// Returns `Err(self)` if this buffer does not have the same [`Layout`] as
334-
/// the destination Vec or contains a non-zero offset
369+
/// # Errors
370+
///
371+
/// Returns `Err(self)` if
372+
/// 1. this buffer does not have the same [`Layout`] as the destination Vec
373+
/// 2. contains a non-zero offset
374+
/// 3. The buffer is shared
335375
pub fn into_vec<T: ArrowNativeType>(self) -> Result<Vec<T>, Self> {
336376
let layout = match self.data.deallocation() {
337377
Deallocation::Standard(l) => l,
@@ -414,7 +454,29 @@ impl<T: ArrowNativeType> From<ScalarBuffer<T>> for Buffer {
414454
}
415455
}
416456

417-
/// Creating a `Buffer` instance by storing the boolean values into the buffer
457+
/// Convert from internal `Bytes` (not [`bytes::Bytes`]) to `Buffer`
458+
impl From<Bytes> for Buffer {
459+
#[inline]
460+
fn from(bytes: Bytes) -> Self {
461+
let length = bytes.len();
462+
let ptr = bytes.as_ptr();
463+
Self {
464+
data: Arc::new(bytes),
465+
ptr,
466+
length,
467+
}
468+
}
469+
}
470+
471+
/// Convert from [`bytes::Bytes`], not internal `Bytes` to `Buffer`
472+
impl From<bytes::Bytes> for Buffer {
473+
fn from(bytes: bytes::Bytes) -> Self {
474+
let bytes: Bytes = bytes.into();
475+
Self::from(bytes)
476+
}
477+
}
478+
479+
/// Create a `Buffer` instance by storing the boolean values into the buffer
418480
impl FromIterator<bool> for Buffer {
419481
fn from_iter<I>(iter: I) -> Self
420482
where
@@ -447,7 +509,9 @@ impl<T: ArrowNativeType> From<BufferBuilder<T>> for Buffer {
447509

448510
impl Buffer {
449511
/// Creates a [`Buffer`] from an [`Iterator`] with a trusted (upper) length.
512+
///
450513
/// Prefer this to `collect` whenever possible, as it is ~60% faster.
514+
///
451515
/// # Example
452516
/// ```
453517
/// # use arrow_buffer::buffer::Buffer;

arrow-buffer/src/buffer/mutable.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,7 @@ impl MutableBuffer {
328328
pub(super) fn into_buffer(self) -> Buffer {
329329
let bytes = unsafe { Bytes::new(self.data, self.len, Deallocation::Standard(self.layout)) };
330330
std::mem::forget(self);
331-
Buffer::from_bytes(bytes)
331+
Buffer::from(bytes)
332332
}
333333

334334
/// View this buffer as a mutable slice of a specific type.

arrow-buffer/src/bytes.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,18 @@ use crate::buffer::dangling_ptr;
2828

2929
/// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself.
3030
///
31-
/// This structs' API is inspired by the `bytes::Bytes`, but it is not limited to using rust's
32-
/// global allocator nor u8 alignment.
31+
/// Note that this structure is an internal implementation detail of the
32+
/// arrow-rs crate. While it has the same name and similar API as
33+
/// [`bytes::Bytes`] it is not limited to rust's global allocator nor u8
34+
/// alignment. It is possible to create a `Bytes` from `bytes::Bytes` using the
35+
/// `From` implementation.
3336
///
3437
/// In the most common case, this buffer is allocated using [`alloc`](std::alloc::alloc)
3538
/// with an alignment of [`ALIGNMENT`](crate::alloc::ALIGNMENT)
3639
///
3740
/// When the region is allocated by a different allocator, [Deallocation::Custom], this calls the
3841
/// custom deallocator to deallocate the region when it is no longer needed.
42+
///
3943
pub struct Bytes {
4044
/// The raw pointer to be beginning of the region
4145
ptr: NonNull<u8>,

arrow-flight/src/decode.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ impl FlightDataDecoder {
295295
));
296296
};
297297

298-
let buffer = Buffer::from_bytes(data.data_body.into());
298+
let buffer = Buffer::from(data.data_body);
299299
let dictionary_batch = message.header_as_dictionary_batch().ok_or_else(|| {
300300
FlightError::protocol(
301301
"Could not get dictionary batch from DictionaryBatch message",

arrow-flight/src/sql/client.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ pub fn arrow_data_from_flight_data(
721721

722722
let dictionaries_by_field = HashMap::new();
723723
let record_batch = read_record_batch(
724-
&Buffer::from_bytes(flight_data.data_body.into()),
724+
&Buffer::from(flight_data.data_body),
725725
ipc_record_batch,
726726
arrow_schema_ref.clone(),
727727
&dictionaries_by_field,

parquet/src/arrow/array_reader/byte_view_array.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -316,9 +316,8 @@ impl ByteViewArrayDecoderPlain {
316316
}
317317

318318
pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result<usize> {
319-
// Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy
320-
// Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy
321-
let buf = arrow_buffer::Buffer::from_bytes(self.buf.clone().into());
319+
// Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer`
320+
let buf = arrow_buffer::Buffer::from(self.buf.clone());
322321
let block_id = output.append_block(buf);
323322

324323
let to_read = len.min(self.max_remaining_values);
@@ -549,9 +548,8 @@ impl ByteViewArrayDecoderDeltaLength {
549548

550549
let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_read];
551550

552-
// Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy
553-
// Then we convert `arrow_buffer::Bytes` into `arrow_buffer:Buffer`, which is also zero copy
554-
let bytes = arrow_buffer::Buffer::from_bytes(self.data.clone().into());
551+
// Zero copy convert `bytes::Bytes` into `arrow_buffer::Buffer`
552+
let bytes = Buffer::from(self.data.clone());
555553
let block_id = output.append_block(bytes);
556554

557555
let mut current_offset = self.data_offset;

0 commit comments

Comments
 (0)