Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cpp/src/arrow/dataset/file_orc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ using internal::checked_pointer_cast;

namespace dataset {

// OrcSchemaManifest implementation

/// \brief Placeholder for building an OrcSchemaManifest (Task #1).
///
/// Stores `schema` in `manifest->origin_schema` and then reports that the
/// real construction logic is not implemented yet. `orc_type` is not read by
/// this placeholder.
///
/// \param schema   Arrow schema recorded as the manifest's origin schema
/// \param orc_type currently unused
/// \param manifest output manifest; must not be null
/// \return Status::Invalid when `manifest` is null, otherwise
///         Status::NotImplemented
Status OrcSchemaManifest::Make(const std::shared_ptr<Schema>& schema,
                               const void* orc_type, OrcSchemaManifest* manifest) {
  // TODO(Task #2): Implement BuildOrcSchemaManifest logic
  // This is a placeholder for Task #1 - actual implementation in Task #2

  // Report a null output pointer as an error instead of dereferencing it.
  if (manifest == nullptr) {
    return Status::Invalid("OrcSchemaManifest::Make requires a non-null manifest");
  }

  // The origin schema is recorded even though the call as a whole fails with
  // NotImplemented.
  manifest->origin_schema = schema;
  return Status::NotImplemented(
      "OrcSchemaManifest::Make will be implemented in Task #2");
}

namespace {

Result<std::unique_ptr<arrow::adapters::orc::ORCFileReader>> OpenORCReader(
Expand Down
69 changes: 69 additions & 0 deletions cpp/src/arrow/dataset/file_orc.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,16 @@

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
#include "arrow/io/type_fwd.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"

namespace arrow {
namespace dataset {
Expand All @@ -35,6 +39,71 @@ namespace dataset {
///
/// @{

/// \brief Associates one arrow::Field with an ORC column index.
///
/// This is the ORC counterpart of Parquet's SchemaField. ORC numbers its
/// columns by a depth-first pre-order walk of the type tree: index 0 is the
/// root struct and the following indices belong to its descendants.
struct ARROW_DS_EXPORT OrcSchemaField {
  /// Arrow field this node describes.
  std::shared_ptr<Field> field;

  /// Nodes for nested fields (struct members, list/map elements).
  std::vector<OrcSchemaField> children;

  /// ORC column index of this node, or -1 when none has been assigned.
  /// Intended to be set only on leaf nodes, i.e. those carrying column
  /// statistics. Index 0 is the ORC root struct; data columns start at 1.
  int column_index = -1;

  /// \return true when a column index has been assigned (anything but -1),
  ///         which this struct treats as "is a leaf with statistics".
  bool is_leaf() const {
    const bool has_assigned_index = column_index != -1;
    return has_assigned_index;
  }
};

/// \brief Connects an ORC file schema with an Arrow Schema.
///
/// Holds the mapping from Arrow schema fields to ORC physical column indices
/// that statistics lookup needs. Modelled on Parquet's SchemaManifest, adapted
/// to ORC's type system.
///
/// NOTE(review): the two lookup maps hold raw `const OrcSchemaField*` values
/// that presumably point into `schema_fields`. If so, a copied manifest would
/// still reference the source's elements. Confirm once Make() is implemented.
struct ARROW_DS_EXPORT OrcSchemaManifest {
  /// \brief Build a manifest from ORC type information.
  ///
  /// \param schema   the Arrow schema
  /// \param orc_type pointer to the orc::Type obtained from the ORC reader,
  ///                 passed as void* so this header need not include ORC headers
  /// \param manifest output manifest to populate
  static Status Make(const std::shared_ptr<Schema>& schema, const void* orc_type,
                     OrcSchemaManifest* manifest);

  /// Arrow schema the manifest was created for.
  std::shared_ptr<Schema> origin_schema;

  /// One entry per top-level schema field.
  std::vector<OrcSchemaField> schema_fields;

  /// ORC column index -> schema field, for direct lookup.
  std::unordered_map<int, const OrcSchemaField*> column_index_to_field;

  /// Child field -> parent field, for walking up the tree.
  std::unordered_map<const OrcSchemaField*, const OrcSchemaField*> child_to_parent;

  /// \brief Look up the schema field registered for an ORC column index.
  ///
  /// \param column_index ORC column index to look up
  /// \param out          receives the matching field; left untouched on failure
  /// \return Status::OK on success, Status::KeyError when the index is unknown
  Status GetColumnField(int column_index, const OrcSchemaField** out) const {
    const auto entry = column_index_to_field.find(column_index);
    if (entry != column_index_to_field.end()) {
      *out = entry->second;
      return Status::OK();
    }
    return Status::KeyError("Column index ", column_index,
                            " not found in ORC schema manifest");
  }

  /// \brief Look up the parent of a field.
  ///
  /// \param field field whose parent is wanted
  /// \return the registered parent, or nullptr when `field` has no entry in
  ///         child_to_parent
  const OrcSchemaField* GetParent(const OrcSchemaField* field) const {
    const auto entry = child_to_parent.find(field);
    return entry == child_to_parent.end() ? nullptr : entry->second;
  }
};

/// Type-name string for the ORC format ("orc").
/// NOTE(review): presumably the identifier the ORC FileFormat reports as its
/// type name - confirm against the class that follows.
constexpr char kOrcTypeName[] = "orc";

/// \brief A FileFormat implementation that reads from and writes to ORC files
Expand Down
Loading