diff --git a/Cargo.lock b/Cargo.lock index 92bfd48c5142..24bf55bc52aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,9 +246,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1bb018b6960c87fd9d025009820406f74e83281185a8bdcb44880d2aa5c9a87" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-arith", "arrow-array", @@ -259,20 +258,19 @@ dependencies = [ "arrow-ipc", "arrow-json", "arrow-ord", + "arrow-pyarrow", "arrow-row", "arrow-schema", "arrow-select", "arrow-string", "half", - "pyo3", "rand 0.9.1", ] [[package]] name = "arrow-arith" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44de76b51473aa888ecd6ad93ceb262fb8d40d1f1154a4df2f069b3590aa7575" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -284,9 +282,8 @@ dependencies = [ [[package]] name = "arrow-array" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29ed77e22744475a9a53d00026cf8e166fe73cf42d89c4c4ae63607ee1cfcc3f" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -301,9 +298,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0391c96eb58bf7389171d1e103112d3fc3e5625ca6b372d606f2688f1ea4cce" +version = "55.2.0" +source = 
"git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "bytes", "half", @@ -312,9 +308,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f39e1d774ece9292697fcbe06b5584401b26bd34be1bec25c33edae65c2420ff" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -333,9 +328,8 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9055c972a07bf12c2a827debfd34f88d3b93da1941d36e1d9fee85eebe38a12a" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-cast", @@ -343,15 +337,13 @@ dependencies = [ "chrono", "csv", "csv-core", - "lazy_static", "regex", ] [[package]] name = "arrow-data" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf75ac27a08c7f48b88e5c923f267e980f27070147ab74615ad85b5c5f90473d" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-buffer", "arrow-schema", @@ -361,9 +353,8 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91efc67a4f5a438833dd76ef674745c80f6f6b9a428a3b440cbfbf74e32867e6" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-arith", "arrow-array", 
@@ -388,9 +379,8 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a222f0d93772bd058d1268f4c28ea421a603d66f7979479048c429292fac7b2e" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -402,9 +392,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9085342bbca0f75e8cb70513c0807cc7351f1fbf5cb98192a67d5e3044acb033" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -424,9 +413,8 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab2f1065a5cad7b9efa9e22ce5747ce826aa3855766755d4904535123ef431e7" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -435,11 +423,21 @@ dependencies = [ "arrow-select", ] +[[package]] +name = "arrow-pyarrow" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" +dependencies = [ + "arrow-array", + "arrow-data", + "arrow-schema", + "pyo3", +] + [[package]] name = "arrow-row" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3703a0e3e92d23c3f756df73d2dc9476873f873a76ae63ef9d3de17fda83b2d8" +version = "55.2.0" +source = 
"git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -450,9 +448,8 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73a47aa0c771b5381de2b7f16998d351a6f4eb839f1e13d48353e17e873d969b" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "bitflags 2.9.1", "serde", @@ -461,9 +458,8 @@ dependencies = [ [[package]] name = "arrow-select" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b7b85575702b23b85272b01bc1c25a01c9b9852305e5d0078c79ba25d995d4" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -475,9 +471,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9260fddf1cdf2799ace2b4c2fc0356a9789fa7551e0953e35435536fecefebbd" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4420,9 +4415,8 @@ dependencies = [ [[package]] name = "parquet" -version = "55.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be7b2d778f6b841d37083ebdf32e33a524acde1266b5884a8ca29bf00dfa1231" +version = "55.2.0" +source = "git+https://github.com/spiceai/arrow-rs.git?rev=53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2#53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" dependencies = [ "ahash 0.8.12", "arrow-array", diff --git a/Cargo.toml b/Cargo.toml index 
366701bdae2e..973cb5933ce1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -218,3 +218,11 @@ uninlined_format_args = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)", "cfg(tarpaulin_include)"] } unused_qualifications = "deny" +[patch.crates-io] +arrow = { git = "https://github.com/spiceai/arrow-rs.git", rev = "53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" } # spiceai-55.2 +arrow-buffer = { git = "https://github.com/spiceai/arrow-rs.git", rev = "53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" } # spiceai-55.2 +arrow-flight = { git = "https://github.com/spiceai/arrow-rs.git", rev = "53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" } # spiceai-55.2 +arrow-ipc = { git = "https://github.com/spiceai/arrow-rs.git", rev = "53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" } # spiceai-55.2 +arrow-ord = { git = "https://github.com/spiceai/arrow-rs.git", rev = "53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" } # spiceai-55.2 +arrow-schema = { git = "https://github.com/spiceai/arrow-rs.git", rev = "53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" } # spiceai-55.2 +parquet = { git = "https://github.com/spiceai/arrow-rs.git", rev = "53162ed30fe6a2ed219b0af4dbbcd5d14745d7c2" } # spiceai-55.2 diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 883d2b60a897..0753509650e4 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -430,6 +430,10 @@ config_namespace! { /// rows decoded. pub enable_page_index: bool, default = true + /// (reading) If true, the parquet reader will tolerate missing page index metadata + /// instead of returning an error when the expected page index metadata is absent. 
+ pub tolerate_missing_page_index: bool, default = false + /// (reading) If true, the parquet reader attempts to skip entire row groups based /// on the predicate in the query and the metadata (min/max values) stored in /// the parquet file diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 07e763f0ee6f..1817edf5ab12 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -228,6 +228,7 @@ impl ParquetOptions { // not in WriterProperties enable_page_index: _, + tolerate_missing_page_index: _, pruning: _, skip_metadata: _, metadata_size_hint: _, @@ -502,6 +503,7 @@ mod tests { // not in WriterProperties, but itemizing here to not skip newly added props enable_page_index: defaults.enable_page_index, + tolerate_missing_page_index: defaults.tolerate_missing_page_index, pruning: defaults.pruning, skip_metadata: defaults.skip_metadata, metadata_size_hint: defaults.metadata_size_hint, @@ -608,6 +610,8 @@ mod tests { // not in WriterProperties enable_page_index: global_options_defaults.enable_page_index, + tolerate_missing_page_index: global_options_defaults + .tolerate_missing_page_index, pruning: global_options_defaults.pruning, skip_metadata: global_options_defaults.skip_metadata, metadata_size_hint: global_options_defaults.metadata_size_hint, diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 9e14425074f7..d94f499aac17 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -41,7 +41,7 @@ use log::debug; use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::file::metadata::ParquetMetaDataReader; +use parquet::file::metadata::{PageIndexPolicy, 
ParquetMetaDataReader}; /// Implements [`FileOpener`] for a parquet file pub(super) struct ParquetOpener { @@ -73,6 +73,8 @@ pub(super) struct ParquetOpener { /// Should the page index be read from parquet files, if present, to skip /// data pages pub enable_page_index: bool, + /// Should the Parquet reader tolerate missing page indexes? + pub tolerate_missing_page_index: bool, /// Should the bloom filter be read from parquet, if present, to skip row /// groups pub enable_bloom_filter: bool, @@ -123,6 +125,7 @@ impl FileOpener for ParquetOpener { .global_counter("num_predicate_creation_errors"); let enable_page_index = self.enable_page_index; + let tolerate_missing_page_index = self.tolerate_missing_page_index; Ok(Box::pin(async move { // Don't load the page index yet. Since it is not stored inline in @@ -190,11 +193,16 @@ impl FileOpener for ParquetOpener { // code above may not have read the page index structures yet. If we // need them for reading and they aren't yet loaded, we need to load them now. 
if should_enable_page_index(enable_page_index, &page_pruning_predicate) { + let page_index_policy = if tolerate_missing_page_index { + PageIndexPolicy::Optional + } else { + PageIndexPolicy::Required + }; reader_metadata = load_page_index( reader_metadata, &mut async_file_reader, - // Since we're manually loading the page index the option here should not matter but we pass it in for consistency - options.with_page_index(true), + options.with_page_index_policy(page_index_policy), + page_index_policy, ) .await?; } @@ -418,6 +426,7 @@ async fn load_page_index( reader_metadata: ArrowReaderMetadata, input: &mut T, options: ArrowReaderOptions, + page_index_policy: PageIndexPolicy, ) -> Result { let parquet_metadata = reader_metadata.metadata(); let missing_column_index = parquet_metadata.column_index().is_none(); @@ -430,8 +439,9 @@ async fn load_page_index( if missing_column_index || missing_offset_index { let m = Arc::try_unwrap(Arc::clone(parquet_metadata)) .unwrap_or_else(|e| e.as_ref().clone()); - let mut reader = - ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true); + let mut reader = ParquetMetaDataReader::new_with_metadata(m) + .with_page_index_policy(page_index_policy); + reader.load_page_index(input).await?; let new_parquet_metadata = reader.finish()?; let new_arrow_reader = diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index c3658280ecb4..f489da39b05b 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -404,6 +404,26 @@ impl ParquetSource { self.table_parquet_options.global.enable_page_index } + /// If enabled, the reader will not error if + /// the page index is missing from a parquet + /// file and `enable_page_index` is true. 
+ pub fn with_tolerate_missing_page_index( + mut self, + tolerate_missing_page_index: bool, + ) -> Self { + self.table_parquet_options + .global + .tolerate_missing_page_index = tolerate_missing_page_index; + self + } + + /// Return the value described in [`Self::with_tolerate_missing_page_index`] + fn tolerate_missing_page_index(&self) -> bool { + self.table_parquet_options + .global + .tolerate_missing_page_index + } + /// If enabled, the reader will read by the bloom filter pub fn with_bloom_filter_on_read(mut self, bloom_filter_on_read: bool) -> Self { self.table_parquet_options.global.bloom_filter_on_read = bloom_filter_on_read; @@ -515,6 +535,7 @@ impl FileSource for ParquetSource { pushdown_filters: self.pushdown_filters(), reorder_filters: self.reorder_filters(), enable_page_index: self.enable_page_index(), + tolerate_missing_page_index: self.tolerate_missing_page_index(), enable_bloom_filter: self.bloom_filter_on_read(), enable_row_group_stats_pruning: self.table_parquet_options.global.pruning, schema_adapter_factory, diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 35f41155fa05..7cfddba90132 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -548,6 +548,8 @@ message ParquetOptions { oneof coerce_int96_opt { string coerce_int96 = 32; } + + bool tolerate_missing_page_index = 33; // default = false } enum JoinSide { diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index bd969db31687..f60a0fb277bd 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -913,6 +913,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { #[allow(deprecated)] // max_statistics_size Ok(ParquetOptions { enable_page_index: value.enable_page_index, + tolerate_missing_page_index: 
value.tolerate_missing_page_index, pruning: value.pruning, skip_metadata: value.skip_metadata, metadata_size_hint: value diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index 1ac35742c73a..76613cf3c445 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -4933,6 +4933,9 @@ impl serde::Serialize for ParquetOptions { if !self.created_by.is_empty() { len += 1; } + if self.tolerate_missing_page_index { + len += 1; + } if self.metadata_size_hint_opt.is_some() { len += 1; } @@ -5041,6 +5044,9 @@ impl serde::Serialize for ParquetOptions { if !self.created_by.is_empty() { struct_ser.serialize_field("createdBy", &self.created_by)?; } + if self.tolerate_missing_page_index { + struct_ser.serialize_field("tolerateMissingPageIndex", &self.tolerate_missing_page_index)?; + } if let Some(v) = self.metadata_size_hint_opt.as_ref() { match v { parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v) => { @@ -5177,6 +5183,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "maxRowGroupSize", "created_by", "createdBy", + "tolerate_missing_page_index", + "tolerateMissingPageIndex", "metadata_size_hint", "metadataSizeHint", "compression", @@ -5221,6 +5229,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { DataPageRowCountLimit, MaxRowGroupSize, CreatedBy, + TolerateMissingPageIndex, MetadataSizeHint, Compression, DictionaryEnabled, @@ -5273,6 +5282,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dataPageRowCountLimit" | "data_page_row_count_limit" => Ok(GeneratedField::DataPageRowCountLimit), "maxRowGroupSize" | "max_row_group_size" => Ok(GeneratedField::MaxRowGroupSize), "createdBy" | "created_by" => Ok(GeneratedField::CreatedBy), + "tolerateMissingPageIndex" | "tolerate_missing_page_index" => Ok(GeneratedField::TolerateMissingPageIndex), "metadataSizeHint" | "metadata_size_hint" => Ok(GeneratedField::MetadataSizeHint), 
"compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), @@ -5323,6 +5333,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut data_page_row_count_limit__ = None; let mut max_row_group_size__ = None; let mut created_by__ = None; + let mut tolerate_missing_page_index__ = None; let mut metadata_size_hint_opt__ = None; let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; @@ -5470,6 +5481,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } created_by__ = Some(map_.next_value()?); } + GeneratedField::TolerateMissingPageIndex => { + if tolerate_missing_page_index__.is_some() { + return Err(serde::de::Error::duplicate_field("tolerateMissingPageIndex")); + } + tolerate_missing_page_index__ = Some(map_.next_value()?); + } GeneratedField::MetadataSizeHint => { if metadata_size_hint_opt__.is_some() { return Err(serde::de::Error::duplicate_field("metadataSizeHint")); @@ -5559,6 +5576,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { data_page_row_count_limit: data_page_row_count_limit__.unwrap_or_default(), max_row_group_size: max_row_group_size__.unwrap_or_default(), created_by: created_by__.unwrap_or_default(), + tolerate_missing_page_index: tolerate_missing_page_index__.unwrap_or_default(), metadata_size_hint_opt: metadata_size_hint_opt__, compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index a55714f190c5..3e9c9aa1ad35 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -770,6 +770,9 @@ pub struct ParquetOptions { pub max_row_group_size: u64, #[prost(string, tag = "16")] pub created_by: ::prost::alloc::string::String, + /// default = false + #[prost(bool, tag = "33")] + pub tolerate_missing_page_index: bool, #[prost(oneof = 
"parquet_options::MetadataSizeHintOpt", tags = "4")] pub metadata_size_hint_opt: ::core::option::Option< parquet_options::MetadataSizeHintOpt, diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index b6cbe5759cfc..094e6b4b5894 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -805,6 +805,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { fn try_from(value: &ParquetOptions) -> datafusion_common::Result { Ok(protobuf::ParquetOptions { enable_page_index: value.enable_page_index, + tolerate_missing_page_index: value.tolerate_missing_page_index, pruning: value.pruning, skip_metadata: value.skip_metadata, metadata_size_hint_opt: value.metadata_size_hint.map(|v| protobuf::parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v as u64)), diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index a55714f190c5..3e9c9aa1ad35 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -770,6 +770,9 @@ pub struct ParquetOptions { pub max_row_group_size: u64, #[prost(string, tag = "16")] pub created_by: ::prost::alloc::string::String, + /// default = false + #[prost(bool, tag = "33")] + pub tolerate_missing_page_index: bool, #[prost(oneof = "parquet_options::MetadataSizeHintOpt", tags = "4")] pub metadata_size_hint_opt: ::core::option::Option< parquet_options::MetadataSizeHintOpt, diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index d3f6511ec98f..a38dc9edb00e 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -362,6 +362,7 @@ impl TableParquetOptionsProto { TableParquetOptionsProto { global: Some(ParquetOptionsProto { enable_page_index: 
global_options.global.enable_page_index, + tolerate_missing_page_index: global_options.global.tolerate_missing_page_index, pruning: global_options.global.pruning, skip_metadata: global_options.global.skip_metadata, metadata_size_hint_opt: global_options.global.metadata_size_hint.map(|size| { @@ -461,6 +462,7 @@ impl From<&ParquetOptionsProto> for ParquetOptions { #[allow(deprecated)] // max_statistics_size ParquetOptions { enable_page_index: proto.enable_page_index, + tolerate_missing_page_index: proto.tolerate_missing_page_index, pruning: proto.pruning, skip_metadata: proto.skip_metadata, metadata_size_hint: proto.metadata_size_hint_opt.as_ref().map(|opt| match opt {