Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d01acf9
Allow parquet readers to use pre-materialized metadatas
mhaseeb123 Nov 21, 2025
2c83509
Allow parquet readers to use external `datasource`s
mhaseeb123 Nov 21, 2025
2cb9738
Improve docs
mhaseeb123 Nov 21, 2025
67bd2bc
Add more tests
mhaseeb123 Nov 21, 2025
8bede77
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Nov 21, 2025
b81c66a
Fix docs
mhaseeb123 Nov 21, 2025
ebce1a0
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Nov 21, 2025
f7ce471
Use multithreaded `setup_page_index` in hybrid scan reader
mhaseeb123 Nov 25, 2025
034983a
Merge branch 'main' into fea/multithreaded-setup-pgidx
mhaseeb123 Nov 25, 2025
8b85949
style fix
mhaseeb123 Nov 25, 2025
e9e8976
Minor improvements
mhaseeb123 Nov 25, 2025
3795225
Merge branch 'fea/multithreaded-setup-pgidx' of https://github.com/mh…
mhaseeb123 Nov 25, 2025
0ef323c
Address feedback
Dec 3, 2025
d33c9ee
Address partial feedback
mhaseeb123 Dec 3, 2025
e056495
Rename `read_parquet_metadata` to `read_parquet_footers`
mhaseeb123 Dec 3, 2025
9e00764
Add some extra checks
mhaseeb123 Dec 4, 2025
25b0702
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Dec 10, 2025
f938501
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Dec 10, 2025
d84f08f
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Dec 10, 2025
9fbe8bd
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Dec 10, 2025
b5a298f
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Dec 11, 2025
4bc5e10
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Dec 13, 2025
5600ae5
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Dec 15, 2025
87e3a2e
Merge branch 'main' into fea/read-parquet-with-pre-populated-footer
mhaseeb123 Dec 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cpp/include/cudf/io/datasource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,21 @@ class datasource {
};
};

/**
 * @brief Constructs datasources from dataset source information
 *
 * Creates one `datasource` per source contained in `info`. The returned datasources can be
 * passed to readers that accept pre-constructed sources (e.g. the `read_parquet` overload
 * taking a vector of datasources).
 *
 * @ingroup io_datasources
 *
 * @param info Dataset source information
 * @param offset Starting byte offset from which data will be read (default zero)
 * @param max_size_estimate Upper estimate of the data range that will be read (default zero,
 * which means the entire file after `offset`)
 * @return Constructed vector of datasource objects
 */
std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
                                                                    size_t offset = 0,
                                                                    size_t max_size_estimate = 0);

/** @} */ // end of group
} // namespace io
} // namespace CUDF_EXPORT cudf
15 changes: 15 additions & 0 deletions cpp/include/cudf/io/detail/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@ class reader {
* @brief Constructor from an array of datasources
*
* @param sources Input `datasource` objects to read the dataset from
* @param parquet_metadatas Pre-materialized Parquet file metadata(s). Read from sources if empty
* @param options Settings for controlling reading behavior
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource to use for device memory allocation
*/
explicit reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
std::vector<FileMetaData>&& parquet_metadatas,
parquet_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
Expand Down Expand Up @@ -134,13 +136,15 @@ class chunked_reader : private reader {
* @param pass_read_limit Limit on total amount of memory used for temporary computations during
* loading, or `0` if there is no limit
* @param sources Input `datasource` objects to read the dataset from
* @param parquet_metadatas Pre-materialized Parquet file metadata(s). Read from sources if empty
* @param options Settings for controlling reading behavior
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
explicit chunked_reader(std::size_t chunk_read_limit,
std::size_t pass_read_limit,
std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
std::vector<parquet::FileMetaData>&& parquet_metadatas,
parquet_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
Expand Down Expand Up @@ -248,6 +252,17 @@ class writer {
* metadata.
*/
parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources);

/**
 * @brief Constructs FileMetaData objects from parquet dataset
 *
 * Reads and decodes the Parquet footer of each input source, producing the raw (Thrift-level)
 * file metadata that can later be handed back to the reader to skip re-reading footers.
 *
 * @param sources Input `datasource` objects to read the dataset from
 *
 * @return List of FileMetaData objects, one per parquet source
 */
std::vector<parquet::FileMetaData> read_parquet_footers(
  host_span<std::unique_ptr<datasource> const> sources);

} // namespace parquet::detail
} // namespace io
} // namespace CUDF_EXPORT cudf
9 changes: 5 additions & 4 deletions cpp/include/cudf/io/orc_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ namespace io {
* @file
*/

//! ORC data type
using cudf::io::orc::TypeKind;

/**
* @brief Holds column names and buffers containing raw file-level and stripe-level statistics.
*
Expand Down Expand Up @@ -224,9 +227,7 @@ struct orc_column_schema {
* @param type ORC type
* @param children child columns (empty for non-nested types)
*/
orc_column_schema(std::string_view name,
orc::TypeKind type,
std::vector<orc_column_schema> children)
orc_column_schema(std::string_view name, TypeKind type, std::vector<orc_column_schema> children)
: _name{name}, _type_kind{type}, _children{std::move(children)}
{
}
Expand Down Expand Up @@ -282,7 +283,7 @@ struct orc_column_schema {

private:
std::string _name;
orc::TypeKind _type_kind;
TypeKind _type_kind;
std::vector<orc_column_schema> _children;
};

Expand Down
83 changes: 83 additions & 0 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,34 @@ table_with_metadata read_parquet(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
 * @brief Reads a Parquet dataset into a set of columns using pre-existing Parquet datasources and
 * file metadatas.
 *
 * The following code snippet demonstrates how to read a dataset from a file:
 * @code
 * auto sources = cudf::io::make_datasources(cudf::io::source_info("dataset.parquet"));
 * auto metadatas = cudf::io::read_parquet_metadata(sources);
 * auto options = cudf::io::parquet_reader_options::builder();
 * auto result = cudf::io::read_parquet(std::move(sources), std::move(metadatas), options);
 * @endcode
 *
 * @note Both `sources` and `parquet_metadatas` are consumed (moved from) by this call.
 *
 * @param sources Input `datasource` objects to read the dataset from
 * @param parquet_metadatas Pre-materialized Parquet file metadata(s). Read from sources if empty
 * @param options Settings for controlling reading behavior
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate device memory of the table in the returned
 * table_with_metadata
 *
 * @return The set of columns along with metadata
 */
table_with_metadata read_parquet(
  std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
  std::vector<parquet::FileMetaData>&& parquet_metadatas,
  parquet_reader_options const& options,
  rmm::cuda_stream_view stream = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief The chunked parquet reader class to read Parquet file iteratively in to a series of
* tables, chunk by chunk.
Expand Down Expand Up @@ -705,6 +733,30 @@ class chunked_parquet_reader {
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
 * @brief Constructor for chunked reader using pre-existing Parquet datasources and
 * file metadatas.
 *
 * This constructor requires the same `parquet_reader_options` parameter as in
 * `cudf::read_parquet()`, and an additional parameter to specify the byte size limit of the
 * output table for each read.
 *
 * @param chunk_read_limit Limit on total number of bytes to be returned per read,
 * or `0` if there is no limit
 * @param sources Input `datasource` objects to read the dataset from
 * @param parquet_metadatas Pre-materialized Parquet file metadata(s). Read from sources if empty
 * @param options The options used to read Parquet file
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource to use for device memory allocation
 */
chunked_parquet_reader(
  std::size_t chunk_read_limit,
  std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
  std::vector<parquet::FileMetaData>&& parquet_metadatas,
  parquet_reader_options const& options,
  rmm::cuda_stream_view stream = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Constructor for chunked reader.
*
Expand All @@ -731,6 +783,37 @@ class chunked_parquet_reader {
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
 * @brief Constructor for chunked reader using pre-existing Parquet datasources and
 * file metadatas.
 *
 * This constructor requires the same `parquet_reader_options` parameter as in
 * `cudf::read_parquet()`, with additional parameters to specify the byte size limit of the
 * output table for each read, and a byte limit on the amount of temporary memory to use
 * when reading. pass_read_limit affects how many row groups we can read at a time by limiting
 * the amount of memory dedicated to decompression space. pass_read_limit is a hint, not an
 * absolute limit - if a single row group cannot fit within the limit given, it will still be
 * loaded.
 *
 * @param chunk_read_limit Limit on total number of bytes to be returned per read,
 * or `0` if there is no limit
 * @param pass_read_limit Limit on the amount of memory used for reading and decompressing data or
 * `0` if there is no limit
 * @param sources Input `datasource` objects to read the dataset from
 * @param parquet_metadatas Pre-materialized Parquet file metadata(s). Read from sources if empty
 * @param options The options used to read Parquet file
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource to use for device memory allocation
 */
chunked_parquet_reader(
  std::size_t chunk_read_limit,
  std::size_t pass_read_limit,
  std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
  std::vector<parquet::FileMetaData>&& parquet_metadatas,
  parquet_reader_options const& options,
  rmm::cuda_stream_view stream = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Destructor, destroying the internal reader instance.
*
Expand Down
15 changes: 14 additions & 1 deletion cpp/include/cudf/io/parquet_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#pragma once

#include <cudf/io/datasource.hpp>
#include <cudf/io/parquet_schema.hpp>
#include <cudf/io/types.hpp>
#include <cudf/utilities/export.hpp>
Expand Down Expand Up @@ -262,13 +263,25 @@ class parquet_metadata {
*
* @ingroup io_readers
*
* @param src_info Dataset source
* @param src_info Dataset source information
*
* @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value
* metadata
*/
parquet_metadata read_parquet_metadata(source_info const& src_info);

/**
 * @brief Constructs FileMetaData objects from parquet dataset
 *
 * Decodes the footer of each input source into its raw Parquet file metadata. The result can be
 * moved into the `read_parquet` overload that accepts pre-materialized metadata, avoiding a
 * second footer read.
 *
 * @ingroup io_readers
 *
 * @param sources Input `datasource` objects to read the dataset from
 *
 * @return List of FileMetaData objects, one per parquet source
 */
std::vector<parquet::FileMetaData> read_parquet_metadata(
  cudf::host_span<std::unique_ptr<cudf::io::datasource> const> sources);

/** @} */ // end of group
} // namespace io
} // namespace CUDF_EXPORT cudf
88 changes: 80 additions & 8 deletions cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <cudf/io/orc_metadata.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/io/parquet_metadata.hpp>
#include <cudf/io/parquet_schema.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

Expand Down Expand Up @@ -150,11 +151,12 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
return chunked_parquet_writer_options_builder{sink};
}

namespace {

/**
* @copydoc cudf::io::make_datasources
*/
std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
size_t offset = 0,
size_t max_size_estimate = 0)
size_t offset,
size_t max_size_estimate)
{
switch (info.type()) {
case io_type::FILEPATH: {
Expand Down Expand Up @@ -188,6 +190,8 @@ std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info
}
}

namespace {

std::vector<std::unique_ptr<data_sink>> make_datasinks(sink_info const& info)
{
switch (info.type()) {
Expand Down Expand Up @@ -612,8 +616,22 @@ table_with_metadata read_parquet(parquet_reader_options const& options,
CUDF_FUNC_RANGE();

auto datasources = make_datasources(options.get_source());
auto reader =
std::make_unique<detail_parquet::reader>(std::move(datasources), options, stream, mr);
auto reader = std::make_unique<detail_parquet::reader>(
std::move(datasources), std::vector<parquet::FileMetaData>{}, options, stream, mr);

return reader->read();
}

// Reads a Parquet dataset from caller-provided datasources, optionally reusing
// pre-materialized file footers so they are not decoded a second time.
table_with_metadata read_parquet(std::vector<std::unique_ptr<cudf::io::datasource>>&& datasources,
                                 std::vector<parquet::FileMetaData>&& parquet_metadatas,
                                 parquet_reader_options const& options,
                                 rmm::cuda_stream_view stream,
                                 rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();

  // Hand ownership of sources and footers to the detail reader, then materialize the table.
  auto const pq_reader = std::make_unique<detail_parquet::reader>(
    std::move(datasources), std::move(parquet_metadatas), options, stream, mr);
  return pq_reader->read();
}
Expand All @@ -626,6 +644,13 @@ parquet_metadata read_parquet_metadata(source_info const& src_info)
return detail_parquet::read_parquet_metadata(datasources);
}

// Public entry point: decode the Parquet footer of every source into raw FileMetaData.
std::vector<parquet::FileMetaData> read_parquet_metadata(
  host_span<std::unique_ptr<cudf::io::datasource> const> sources)
{
  CUDF_FUNC_RANGE();
  // Delegate directly to the detail-layer footer reader; one entry per input source.
  auto footers = detail_parquet::read_parquet_footers(sources);
  return footers;
}

/**
* @copydoc cudf::io::merge_row_group_metadata
*/
Expand Down Expand Up @@ -700,8 +725,33 @@ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit,
parquet_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
: reader{std::make_unique<detail_parquet::chunked_reader>(
chunk_read_limit, 0, make_datasources(options.get_source()), options, stream, mr)}
: reader{std::make_unique<detail_parquet::chunked_reader>(chunk_read_limit,
0,
make_datasources(options.get_source()),
std::vector<parquet::FileMetaData>{},
options,
stream,
mr)}
{
}

/**
 * @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader
 */
chunked_parquet_reader::chunked_parquet_reader(
  std::size_t chunk_read_limit,
  std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
  std::vector<parquet::FileMetaData>&& metadatas,
  parquet_reader_options const& options,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr)
  // Delegate to the detail chunked reader; a pass_read_limit of 0 means "no pass limit".
  : reader{std::make_unique<detail_parquet::chunked_reader>(
      chunk_read_limit, 0, std::move(sources), std::move(metadatas), options, stream, mr)}
{
}

Expand All @@ -716,6 +766,28 @@ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit,
: reader{std::make_unique<detail_parquet::chunked_reader>(chunk_read_limit,
pass_read_limit,
make_datasources(options.get_source()),
std::vector<parquet::FileMetaData>{},
options,
stream,
mr)}
{
}

/**
* @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader
*/
chunked_parquet_reader::chunked_parquet_reader(
std::size_t chunk_read_limit,
std::size_t pass_read_limit,
std::vector<std::unique_ptr<cudf::io::datasource>>&& datasources,
std::vector<parquet::FileMetaData>&& parquet_metadatas,
parquet_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
: reader{std::make_unique<detail_parquet::chunked_reader>(chunk_read_limit,
pass_read_limit,
std::move(datasources),
std::move(parquet_metadatas),
options,
stream,
mr)}
Expand Down
Loading
Loading