Skip to content

Commit a1b333c

Browse files
authored
Implement data page pruning using Parquet page index stats (#18873)
Contributes to #17896. Part of #18011. Implements feature request in #9269 This PR implements discarding of Parquet data pages using the page level (min/max) statistics contained in the page index section of a parquet file, in the experimental Parquet reader for optimizing hybrid scan queries. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Shruti Shivakumar (https://github.com/shrshi) URL: #18873
1 parent dc196bd commit a1b333c

File tree

6 files changed

+1112
-18
lines changed

6 files changed

+1112
-18
lines changed

cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,7 @@ add_library(
534534
src/io/parquet/experimental/hybrid_scan.cpp
535535
src/io/parquet/experimental/hybrid_scan_helpers.cpp
536536
src/io/parquet/experimental/hybrid_scan_impl.cpp
537+
src/io/parquet/experimental/page_index_filter.cu
537538
src/io/parquet/page_data.cu
538539
src/io/parquet/chunk_dict.cu
539540
src/io/parquet/page_enc.cu

cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,49 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base {
203203
host_span<cudf::size_type const> output_column_schemas,
204204
std::reference_wrapper<ast::expression const> filter,
205205
rmm::cuda_stream_view stream) const;
206+
207+
/**
208+
* @brief Filter data pages using statistics page-level statistics based on predicate filter
209+
*
210+
* @param row_group_indices Input row groups indices
211+
* @param output_dtypes Datatypes of output columns
212+
* @param output_column_schemas schema indices of output columns
213+
* @param filter AST expression to filter data pages based on `PageIndex` statistics
214+
* @param stream CUDA stream used for device memory operations and kernel launches
215+
* @param mr Device memory resource used to allocate the returned column's device memory
216+
*
217+
* @return A boolean column representing a mask of rows surviving the predicate filter at
218+
* page-level
219+
*/
220+
[[nodiscard]] std::unique_ptr<cudf::column> filter_data_pages_with_stats(
221+
cudf::host_span<std::vector<size_type> const> row_group_indices,
222+
cudf::host_span<cudf::data_type const> output_dtypes,
223+
cudf::host_span<cudf::size_type const> output_column_schemas,
224+
std::reference_wrapper<ast::expression const> filter,
225+
rmm::cuda_stream_view stream,
226+
rmm::device_async_resource_ref mr) const;
227+
228+
/**
229+
* @brief Computes which data pages need decoding to construct input columns based on the row mask
230+
*
231+
* Compute a vector of boolean vectors indicating which data pages need to be decoded to
232+
* construct each input column based on the row mask, one vector per column
233+
*
234+
* @param row_mask Boolean column indicating which rows need to be read after page-pruning
235+
* @param row_group_indices Input row groups indices
236+
* @param output_dtypes Datatypes of output columns
237+
* @param output_column_schemas schema indices of output columns
238+
* @param stream CUDA stream used for device memory operations and kernel launches
239+
*
240+
* @return A vector of boolean vectors indicating which data pages need to be decoded to produce
241+
* the output table based on the input row mask, one per input column
242+
*/
243+
[[nodiscard]] std::vector<thrust::host_vector<bool>> compute_data_page_mask(
244+
cudf::column_view row_mask,
245+
cudf::host_span<std::vector<size_type> const> row_group_indices,
246+
cudf::host_span<cudf::data_type const> output_dtypes,
247+
cudf::host_span<cudf::size_type const> output_column_schemas,
248+
rmm::cuda_stream_view stream) const;
206249
};
207250

208251
/**

cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,29 @@ hybrid_scan_reader_impl::filter_data_pages_with_stats(
262262
rmm::cuda_stream_view stream,
263263
rmm::device_async_resource_ref mr)
264264
{
265-
return {};
265+
CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered");
266+
CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression");
267+
268+
select_columns(read_mode::FILTER_COLUMNS, options);
269+
270+
table_metadata metadata;
271+
populate_metadata(metadata);
272+
auto expr_conv = named_to_reference_converter(options.get_filter(), metadata);
273+
CUDF_EXPECTS(expr_conv.get_converted_expr().has_value(),
274+
"Columns names in filter expression must be convertible to index references");
275+
auto output_dtypes = get_output_types(_output_buffers_template);
276+
277+
auto row_mask = _metadata->filter_data_pages_with_stats(row_group_indices,
278+
output_dtypes,
279+
_output_column_schemas,
280+
expr_conv.get_converted_expr().value(),
281+
stream,
282+
mr);
283+
284+
auto data_page_mask = _metadata->compute_data_page_mask(
285+
row_mask->view(), row_group_indices, output_dtypes, _output_column_schemas, stream);
286+
287+
return {std::move(row_mask), std::move(data_page_mask)};
266288
}
267289

268290
std::pair<std::vector<byte_range_info>, std::vector<cudf::size_type>>

0 commit comments

Comments
 (0)