 // specific language governing permissions and limitations
 // under the License.
 
-//! API for reading/writing
-//! Arrow [RecordBatch](arrow_array::RecordBatch)es and
-//! [Array](arrow_array::Array)s to/from Parquet Files.
+//! API for reading/writing Arrow [`RecordBatch`]es and [`Array`]s to/from
+//! Parquet Files.
 //!
-//! See the [crate-level documentation](crate) for more details.
+//! See the [crate-level documentation](crate) for more details on other APIs.
 //!
-//! # Example of writing Arrow record batch to Parquet file
+//! # Schema Conversion
+//!
+//! These APIs ensure that data in Arrow [`RecordBatch`]es written to Parquet are
+//! read back as [`RecordBatch`]es with the exact same types and values.
+//!
+//! Parquet and Arrow have different type systems, and there is not
+//! always a one-to-one mapping between the systems. For example, data
+//! stored as a Parquet [`BYTE_ARRAY`] can be read as either an Arrow
+//! [`BinaryViewArray`] or [`BinaryArray`].
+//!
+//! To recover the original Arrow types, the writers in this module add a "hint" to
+//! the metadata in the [`ARROW_SCHEMA_META_KEY`] key which records the original Arrow
+//! schema. The metadata hint follows the same convention as arrow-cpp-based
+//! implementations such as `pyarrow`. The reader looks for the schema hint in the
+//! metadata to determine Arrow types, and if it is not present, infers the Arrow schema
+//! from the Parquet schema.
+//!
+//! In situations where the embedded Arrow schema is not compatible with the Parquet
+//! schema, the Parquet schema takes precedence and no error is raised.
+//! See [#1663](https://github.com/apache/arrow-rs/issues/1663).
+//!
+//! You can also control the type conversion process in more detail using:
+//!
+//! * [`ArrowSchemaConverter`] to control the conversion of Arrow types to Parquet
+//!   types.
+//!
+//! * [`ArrowReaderOptions::with_schema`] to explicitly specify your own Arrow schema hint
+//!   to use when reading Parquet, overriding any metadata that may be present.
+//!
+//! [`RecordBatch`]: arrow_array::RecordBatch
+//! [`Array`]: arrow_array::Array
+//! [`BYTE_ARRAY`]: crate::basic::Type::BYTE_ARRAY
+//! [`BinaryViewArray`]: arrow_array::BinaryViewArray
+//! [`BinaryArray`]: arrow_array::BinaryArray
+//! [`ArrowReaderOptions::with_schema`]: arrow_reader::ArrowReaderOptions::with_schema
+//!
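To make the two bullets above concrete, here is a minimal sketch of supplying an explicit schema hint at read time with `ArrowReaderOptions::with_schema`, plus the Arrow-to-Parquet direction via `ArrowSchemaConverter`. The file name, column name, and chosen types are illustrative assumptions, not part of this diff.

```rust
use std::fs::File;
use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
use parquet::arrow::ArrowSchemaConverter;

fn read_with_explicit_hint() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical file whose single BYTE_ARRAY column "data" we want as Utf8View.
    let schema = Arc::new(Schema::new(vec![Field::new("data", DataType::Utf8View, true)]));

    // Going the other way, ArrowSchemaConverter maps an Arrow schema to a
    // Parquet SchemaDescriptor before writing.
    let _parquet_schema = ArrowSchemaConverter::new().convert(&schema)?;

    // Override (or supply) the schema hint at read time; try_new_with_options
    // errors if the requested schema is incompatible with the Parquet schema.
    let options = ArrowReaderOptions::new().with_schema(schema);
    let file = File::open("data.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```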
+//! # Example: Writing Arrow `RecordBatch` to Parquet file
 //!
 //!```rust
 //! # use arrow_array::{Int32Array, ArrayRef};
…
 //! writer.close().unwrap();
 //! ```
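The hunk above collapses the middle of the writing example. As a self-contained sketch of the same pattern (array → `RecordBatch` → `ArrowWriter`), with an illustrative column name and output path:

```rust
use std::fs::File;
use std::sync::Arc;
use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use parquet::arrow::ArrowWriter;

fn write_batch() -> Result<(), Box<dyn std::error::Error>> {
    // Build a single-column batch; "id" and the path are illustrative.
    let ids = Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as ArrayRef;
    let batch = RecordBatch::try_from_iter(vec![("id", ids)])?;

    let file = File::create("example.parquet")?;
    // `None` uses default WriterProperties; the writer stores the Arrow
    // schema hint under ARROW_SCHEMA_META_KEY automatically.
    let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?; // close() finalizes the Parquet footer
    Ok(())
}
```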
 //!
-//! # Example of reading parquet file into arrow record batch
+//! # Example: Reading Parquet file into Arrow `RecordBatch`
 //!
 //! ```rust
 //! # use std::fs::File;
…
 //! println!("Read {} records.", record_batch.num_rows());
 //! ```
 //!
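Again the example body is collapsed in this view. A complete read-side sketch, assuming an `example.parquet` produced as above:

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

fn read_batches() -> Result<(), Box<dyn std::error::Error>> {
    // Path is illustrative; any Parquet file works.
    let file = File::open("example.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // The builder exposes the hinted (or inferred) Arrow schema before reading.
    println!("schema: {:?}", builder.schema());
    let reader = builder.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```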
-//! # Example of reading non-uniformly encrypted parquet file into arrow record batch
+//! # Example: Reading a non-uniformly encrypted Parquet file into an Arrow `RecordBatch`
 //!
 //! Note: This requires the experimental `encryption` feature to be enabled at compile time.
 //!
-//!
 #![cfg_attr(feature = "encryption", doc = "```rust")]
 #![cfg_attr(not(feature = "encryption"), doc = "```ignore")]
 //! # use arrow_array::{Int32Array, ArrayRef};
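The rest of the encryption example is collapsed in this view. Below is a rough sketch of what reading a non-uniformly encrypted file can look like; the `FileDecryptionProperties` path and builder methods are assumptions based on the experimental `encryption` module, and the keys, column name, and file name are made up.

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
use parquet::encryption::decrypt::FileDecryptionProperties;

fn read_encrypted() -> Result<(), Box<dyn std::error::Error>> {
    // "Non-uniform" encryption: the footer key differs from the column key.
    let footer_key = b"0123456789012345".to_vec();
    let column_key = b"1234567890123450".to_vec();
    let decryption_properties = FileDecryptionProperties::builder(footer_key)
        .with_column_key("x", column_key)
        .build()?;

    let options = ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties);
    let file = File::open("encrypted.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```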
@@ -168,7 +202,6 @@ pub use self::async_reader::ParquetRecordBatchStreamBuilder;
 pub use self::async_writer::AsyncArrowWriter;
 use crate::schema::types::{SchemaDescriptor, Type};
 use arrow_schema::{FieldRef, Schema};
-
 // continue to export deprecated methods until they are removed
 #[allow(deprecated)]
 pub use self::schema::arrow_to_parquet_schema;
@@ -178,7 +211,10 @@ pub use self::schema::{
     parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels,
 };
 
-/// Schema metadata key used to store serialized Arrow IPC schema
+/// Schema metadata key used to store serialized Arrow schema
+///
+/// The Arrow schema is encoded using the Arrow IPC format, and then base64
+/// encoded. This is the same format used by arrow-cpp systems, such as pyarrow.
 pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema";
 
 /// The value of this metadata key, if present on [`Field::metadata`], will be used
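Since `ARROW_SCHEMA_META_KEY` above is the lookup key for the schema hint, here is a small sketch of checking whether a file carries it (file name illustrative):

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ARROW_SCHEMA_META_KEY;

fn show_schema_hint() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("example.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // Look for the base64-encoded Arrow IPC schema in the file's key/value
    // metadata; files written by non-Arrow writers may not have it.
    let hint = builder
        .metadata()
        .file_metadata()
        .key_value_metadata()
        .and_then(|kvs| kvs.iter().find(|kv| kv.key == ARROW_SCHEMA_META_KEY));
    match hint {
        Some(kv) => {
            let len = kv.value.as_deref().map_or(0, |v| v.len());
            println!("schema hint present ({len} bytes)");
        }
        None => println!("no schema hint; Arrow types will be inferred"),
    }
    Ok(())
}
```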