 // specific language governing permissions and limitations
 // under the License.
 
-//! API for reading/writing
-//! Arrow [RecordBatch](arrow_array::RecordBatch)es and
-//! [Array](arrow_array::Array)s to/from Parquet Files.
+//! API for reading/writing Arrow [`RecordBatch`]es and [`Array`]s to/from
+//! Parquet Files.
 //!
-//! See the [crate-level documentation](crate) for more details.
+//! See the [crate-level documentation](crate) for more details on other APIs.
 //!
-//! # Example of writing Arrow record batch to Parquet file
+//! # Schema Conversion
+//!
+//! These APIs ensure that data in Arrow [`RecordBatch`]es written to Parquet are
+//! read back as [`RecordBatch`]es with the exact same types and values.
+//!
+//! Parquet and Arrow have different type systems, and there is not
+//! always a one-to-one mapping between the systems. For example, data
+//! stored as a Parquet [`BYTE_ARRAY`] can be read as either an Arrow
+//! [`BinaryViewArray`] or [`BinaryArray`].
+//!
+//! To recover the original Arrow types, the writers in this module add a "hint" to
+//! the metadata in the [`ARROW_SCHEMA_META_KEY`] key which records the original Arrow
+//! schema. The metadata hint follows the same convention as arrow-cpp-based
+//! implementations such as `pyarrow`. The reader looks for the schema hint in the
+//! metadata to determine Arrow types, and if it is not present, infers the Arrow schema
+//! from the Parquet schema.
+//!
+//! In situations where the embedded Arrow schema is not compatible with the Parquet
+//! schema, the Parquet schema takes precedence and no error is raised.
+//! See [#1663](https://github.com/apache/arrow-rs/issues/1663).
+//!
+//! You can also control the type conversion process in more detail using:
+//!
+//! * [`ArrowSchemaConverter`] to control the conversion of Arrow types to Parquet
+//!   types.
+//!
+//! * [`ArrowReaderOptions::with_schema`] to explicitly specify your own Arrow schema hint
+//!   to use when reading Parquet, overriding any metadata that may be present.
+//!
+//! [`RecordBatch`]: arrow_array::RecordBatch
+//! [`Array`]: arrow_array::Array
+//! [`BYTE_ARRAY`]: crate::basic::Type::BYTE_ARRAY
+//! [`BinaryViewArray`]: arrow_array::BinaryViewArray
+//! [`BinaryArray`]: arrow_array::BinaryArray
+//! [`ArrowReaderOptions::with_schema`]: arrow_reader::ArrowReaderOptions::with_schema
+//!
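To make the two bullets above concrete, here is a minimal sketch of supplying an explicit schema hint at read time with `ArrowReaderOptions::with_schema`, plus the Arrow-to-Parquet direction via `ArrowSchemaConverter`. The file name, column name, and chosen types are illustrative assumptions, not part of this diff.

```rust
use std::fs::File;
use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
use parquet::arrow::ArrowSchemaConverter;

fn read_with_explicit_hint() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical file whose single BYTE_ARRAY column "data" we want as Utf8View.
    let schema = Arc::new(Schema::new(vec![Field::new("data", DataType::Utf8View, true)]));

    // Going the other way, ArrowSchemaConverter maps an Arrow schema to a
    // Parquet SchemaDescriptor before writing.
    let _parquet_schema = ArrowSchemaConverter::new().convert(&schema)?;

    // Override (or supply) the schema hint at read time; try_new_with_options
    // errors if the requested schema is incompatible with the Parquet schema.
    let options = ArrowReaderOptions::new().with_schema(schema);
    let file = File::open("data.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```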
+//! # Example: Writing Arrow `RecordBatch` to Parquet file
 //!
 //!```rust
 //! # use arrow_array::{Int32Array, ArrayRef};
…
 //! writer.close().unwrap();
 //! ```
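The hunk above collapses the middle of the writing example. As a self-contained sketch of the same pattern (array → `RecordBatch` → `ArrowWriter`), with an illustrative column name and output path:

```rust
use std::fs::File;
use std::sync::Arc;
use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::ArrowWriter;

fn write_batch() -> Result<(), Box<dyn std::error::Error>> {
    // Build a single-column batch; "id" and the path are illustrative.
    let ids = Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as ArrayRef;
    let batch = RecordBatch::try_from_iter(vec![("id", ids)])?;

    let file = File::create("example.parquet")?;
    // `None` uses default WriterProperties; the writer stores the Arrow
    // schema hint under ARROW_SCHEMA_META_KEY automatically.
    let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?; // close() finalizes the Parquet footer
    Ok(())
}
```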
 //!
-//! # Example of reading parquet file into arrow record batch
+//! # Example: Reading Parquet file into Arrow `RecordBatch`
 //!
 //! ```rust
 //! # use std::fs::File;
…
 //! println!("Read {} records.", record_batch.num_rows());
 //! ```
 //!
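Again the example body is collapsed in this view. A complete read-side sketch, assuming an `example.parquet` produced as above:

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

fn read_batches() -> Result<(), Box<dyn std::error::Error>> {
    // Path is illustrative; any Parquet file works.
    let file = File::open("example.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // The builder exposes the hinted (or inferred) Arrow schema before reading.
    println!("schema: {:?}", builder.schema());
    let reader = builder.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```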
-//! # Example of reading non-uniformly encrypted parquet file into arrow record batch
+//! # Example: Reading a non-uniformly encrypted Parquet file into an Arrow `RecordBatch`
 //!
 //! Note: This requires the experimental `encryption` feature to be enabled at compile time.
 //!
-//!
 #![cfg_attr(feature = "encryption", doc = "```rust")]
 #![cfg_attr(not(feature = "encryption"), doc = "```ignore")]
 //! # use arrow_array::{Int32Array, ArrayRef};
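The rest of the encryption example is collapsed in this view. Below is a rough sketch of what reading a non-uniformly encrypted file can look like; the `FileDecryptionProperties` path and builder methods are assumptions based on the experimental `encryption` module, and the keys, column name, and file name are made up.

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
use parquet::encryption::decrypt::FileDecryptionProperties;

fn read_encrypted() -> Result<(), Box<dyn std::error::Error>> {
    // "Non-uniform" encryption: the footer key differs from the column key.
    let footer_key = b"0123456789012345".to_vec();
    let column_key = b"1234567890123450".to_vec();
    let decryption_properties = FileDecryptionProperties::builder(footer_key)
        .with_column_key("x", column_key)
        .build()?;

    let options = ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties);
    let file = File::open("encrypted.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```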
@@ -168,7 +202,6 @@ pub use self::async_reader::ParquetRecordBatchStreamBuilder;
 pub use self::async_writer::AsyncArrowWriter;
 use crate::schema::types::{SchemaDescriptor, Type};
 use arrow_schema::{FieldRef, Schema};
-
 // continue to export deprecated methods until they are removed
 #[allow(deprecated)]
 pub use self::schema::arrow_to_parquet_schema;
@@ -178,7 +211,10 @@ pub use self::schema::{
     parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels,
 };
 
-/// Schema metadata key used to store serialized Arrow IPC schema
+/// Schema metadata key used to store serialized Arrow schema
+///
+/// The Arrow schema is encoded using the Arrow IPC format, and then base64
+/// encoded. This is the same format used by arrow-cpp systems, such as pyarrow.
 pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema";
 
 /// The value of this metadata key, if present on [`Field::metadata`], will be used
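Since `ARROW_SCHEMA_META_KEY` above is the lookup key for the schema hint, here is a small sketch of checking whether a file carries it (file name illustrative):

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ARROW_SCHEMA_META_KEY;

fn show_schema_hint() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("example.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // Look for the base64-encoded Arrow IPC schema in the file's key/value
    // metadata; files written by non-Arrow writers may not have it.
    let hint = builder
        .metadata()
        .file_metadata()
        .key_value_metadata()
        .and_then(|kvs| kvs.iter().find(|kv| kv.key == ARROW_SCHEMA_META_KEY));
    match hint {
        Some(kv) => {
            let len = kv.value.as_deref().map_or(0, |v| v.len());
            println!("schema hint present ({len} bytes)");
        }
        None => println!("no schema hint; Arrow types will be inferred"),
    }
    Ok(())
}
```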