From 63c82cb76cf9d39469353f14a14a8fe1d98c87df Mon Sep 17 00:00:00 2001 From: Christian Bush Date: Fri, 20 Feb 2026 14:16:05 -0800 Subject: [PATCH] Task #1: Add OrcSchemaManifest and OrcSchemaField structures - Added OrcSchemaField struct to map Arrow fields to ORC column indices - Added OrcSchemaManifest struct for schema mapping infrastructure - Includes GetColumnField() and GetParent() helper methods - Added stub Make() implementation (full logic in Task #2) - Mirrors Parquet SchemaManifest design adapted for ORC type system Verified: Code structure matches Parquet pattern Co-Authored-By: Claude Sonnet 4.5 --- cpp/src/arrow/dataset/file_orc.cc | 10 +++++ cpp/src/arrow/dataset/file_orc.h | 69 +++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/cpp/src/arrow/dataset/file_orc.cc b/cpp/src/arrow/dataset/file_orc.cc index 1393df57f9d7..8cb44cd5c0be 100644 --- a/cpp/src/arrow/dataset/file_orc.cc +++ b/cpp/src/arrow/dataset/file_orc.cc @@ -35,6 +35,16 @@ using internal::checked_pointer_cast; namespace dataset { +// OrcSchemaManifest implementation +Status OrcSchemaManifest::Make(const std::shared_ptr& schema, + const void* orc_type, OrcSchemaManifest* manifest) { + // TODO(Task #2): Implement BuildOrcSchemaManifest logic + // This is a placeholder for Task #1 - actual implementation in Task #2 + manifest->origin_schema = schema; + return Status::NotImplemented( + "OrcSchemaManifest::Make will be implemented in Task #2"); +} + namespace { Result> OpenORCReader( diff --git a/cpp/src/arrow/dataset/file_orc.h b/cpp/src/arrow/dataset/file_orc.h index 5bfefd1e02b5..3c36dcd5d8d9 100644 --- a/cpp/src/arrow/dataset/file_orc.h +++ b/cpp/src/arrow/dataset/file_orc.h @@ -21,12 +21,16 @@ #include #include +#include +#include #include "arrow/dataset/file_base.h" #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" #include "arrow/io/type_fwd.h" #include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" namespace arrow { namespace dataset { @@ -35,6 +39,71 @@ namespace dataset { /// /// @{ +/// \brief Bridge between an arrow::Field and ORC column indices. +/// +/// Similar to Parquet's SchemaField, this structure maps Arrow schema fields +/// to ORC physical column indices. ORC uses a depth-first pre-order traversal +/// where column 0 is the root struct, and subsequent indices are for child columns. +struct ARROW_DS_EXPORT OrcSchemaField { + /// The Arrow field corresponding to this ORC column + std::shared_ptr field; + + /// Child fields (for nested types like structs, lists, maps) + std::vector children; + + /// ORC column index (only set for leaf nodes that have statistics) + /// For ORC, column 0 is the root struct, columns 1+ are the actual data columns + int column_index = -1; + + /// Check if this is a leaf node (has column statistics) + bool is_leaf() const { return column_index != -1; } +}; + +/// \brief Bridge between an ORC file schema and an Arrow Schema. +/// +/// Maps Arrow schema fields to ORC physical column indices for statistics lookup. +/// Similar to Parquet's SchemaManifest but adapted for ORC's type system. +struct ARROW_DS_EXPORT OrcSchemaManifest { + /// Create a schema manifest from ORC type information + /// \param schema The Arrow schema + /// \param orc_type Pointer to orc::Type from the ORC reader (as void* to avoid ORC header dependency) + /// \param manifest Output manifest to populate + static Status Make(const std::shared_ptr& schema, const void* orc_type, + OrcSchemaManifest* manifest); + + /// The Arrow schema + std::shared_ptr origin_schema; + + /// Top-level schema fields + std::vector schema_fields; + + /// Map from ORC column index to schema field (for fast lookup) + std::unordered_map column_index_to_field; + + /// Map from child field to parent field (for traversal) + std::unordered_map child_to_parent; + + /// Get the schema field for a given ORC column index + Status GetColumnField(int column_index, const OrcSchemaField** out) const { + auto it = column_index_to_field.find(column_index); + if (it == column_index_to_field.end()) { + return Status::KeyError("Column index ", column_index, + " not found in ORC schema manifest"); + } + *out = it->second; + return Status::OK(); + } + + /// Get the parent field of a given field + const OrcSchemaField* GetParent(const OrcSchemaField* field) const { + auto it = child_to_parent.find(field); + if (it == child_to_parent.end()) { + return nullptr; + } + return it->second; + } +}; + constexpr char kOrcTypeName[] = "orc"; /// \brief A FileFormat implementation that reads from and writes to ORC files