Skip to content

Commit a7d0d92

Browse files
authored
GDV-13: [C++] Add support for filters (apache#75)
- Similar to projection, a filter is built for a specific schema and condition (i.e. an expression). - The output of a filter is a selection vector (Int16Array).
1 parent 0cae753 commit a7d0d92

20 files changed

Lines changed: 1142 additions & 71 deletions

include/gandiva/condition.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright (C) 2017-2018 Dremio Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#ifndef GANDIVA_CONDITION_H
17+
#define GANDIVA_CONDITION_H
18+
19+
#include "gandiva/arrow.h"
20+
#include "gandiva/expression.h"
21+
#include "gandiva/gandiva_aliases.h"
22+
23+
namespace gandiva {
24+
25+
/// \brief A condition expression.
26+
class Condition : public Expression {
27+
public:
28+
Condition(const NodePtr root)
29+
: Expression(root, std::make_shared<arrow::Field>("cond", arrow::boolean())) {}
30+
31+
virtual ~Condition() = default;
32+
};
33+
34+
} // namespace gandiva
35+
36+
#endif // GANDIVA_CONDITION_H

include/gandiva/expression.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class Expression {
2626
public:
2727
Expression(const NodePtr root, const FieldPtr result) : root_(root), result_(result) {}
2828

29+
virtual ~Expression() = default;
30+
2931
const NodePtr &root() const { return root_; }
3032

3133
const FieldPtr &result() const { return result_; }

include/gandiva/filter.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright (C) 2017-2018 Dremio Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#ifndef GANDIVA_EXPR_FILTER_H
17+
#define GANDIVA_EXPR_FILTER_H
18+
19+
#include <memory>
20+
#include <string>
21+
#include <utility>
22+
#include <vector>
23+
24+
#include "gandiva/arrow.h"
25+
#include "gandiva/condition.h"
26+
#include "gandiva/configuration.h"
27+
#include "gandiva/selection_vector.h"
28+
#include "gandiva/status.h"
29+
30+
namespace gandiva {
31+
32+
class LLVMGenerator;
33+
34+
/// \brief filter records based on a condition.
35+
///
36+
/// A filter is built for a specific schema and condition. Once the filter is built, it
37+
/// can be used to evaluate many row batches.
38+
class Filter {
39+
public:
40+
Filter(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,
41+
std::shared_ptr<Configuration> config);
42+
43+
~Filter() = default;
44+
45+
/// Build a filter for the given schema and condition, with the default configuration.
46+
///
47+
/// \param[in] : schema schema for the record batches, and the condition.
48+
/// \param[in] : condition filter condition.
49+
/// \param[out]: filter the returned filter object
50+
static Status Make(SchemaPtr schema, ConditionPtr condition,
51+
std::shared_ptr<Filter> *filter) {
52+
return Make(schema, condition, ConfigurationBuilder::DefaultConfiguration(), filter);
53+
}
54+
55+
/// \brief Build a filter for the given schema and condition.
56+
/// Customize the filter with runtime configuration.
57+
///
58+
/// \param[in] : schema schema for the record batches, and the condition.
59+
/// \param[in] : condition filter conditions.
60+
/// \param[in] : config run time configuration.
61+
/// \param[out]: filter the returned filter object
62+
static Status Make(SchemaPtr schema, ConditionPtr condition,
63+
std::shared_ptr<Configuration> config,
64+
std::shared_ptr<Filter> *filter);
65+
66+
/// Evaluate the specified record batch, and populate output selection vector.
67+
///
68+
/// \param[in] : batch the record batch. schema should be the same as the one in 'Make'
69+
/// \param[in/out]: out_selection the selection array with indices of rows that match
70+
/// the condition.
71+
Status Evaluate(const arrow::RecordBatch &batch,
72+
std::shared_ptr<SelectionVector> out_selection);
73+
74+
private:
75+
const std::unique_ptr<LLVMGenerator> llvm_generator_;
76+
const SchemaPtr schema_;
77+
const std::shared_ptr<Configuration> configuration_;
78+
};
79+
80+
} // namespace gandiva
81+
82+
#endif // GANDIVA_EXPR_FILTER_H

include/gandiva/gandiva_aliases.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ class Expression;
4242
using ExpressionPtr = std::shared_ptr<Expression>;
4343
using ExpressionVector = std::vector<ExpressionPtr>;
4444

45+
class Condition;
46+
using ConditionPtr = std::shared_ptr<Condition>;
47+
4548
class Node;
4649
using NodePtr = std::shared_ptr<Node>;
4750
using NodeVector = std::vector<std::shared_ptr<Node>>;

include/gandiva/selection_vector.h

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// Copyright (C) 2017-2018 Dremio Corporation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#ifndef GANDIVA_SELECTION_VECTOR__H
16+
#define GANDIVA_SELECTION_VECTOR__H
17+
18+
#include "gandiva/arrow.h"
19+
#include "gandiva/logging.h"
20+
#include "gandiva/status.h"
21+
22+
namespace gandiva {
23+
24+
/// \brief Selection Vector : vector of indices in a row-batch for a selection,
25+
/// backed by an arrow-array.
26+
class SelectionVector {
27+
public:
28+
~SelectionVector() = default;
29+
30+
/// Get the value at a given index.
31+
virtual int GetIndex(int index) const = 0;
32+
33+
/// Set the value at a given index.
34+
virtual void SetIndex(int index, int value) = 0;
35+
36+
// Get the max supported value in the selection vector.
37+
virtual int GetMaxSupportedValue() const = 0;
38+
39+
/// The maximum slots (capacity) of the selection vector.
40+
virtual int GetMaxSlots() const = 0;
41+
42+
/// The number of slots (size) of the selection vector.
43+
virtual int GetNumSlots() const = 0;
44+
45+
/// Set the number of slots in the selection vector.
46+
virtual void SetNumSlots(int num_slots) = 0;
47+
48+
/// Convert to arrow-array.
49+
virtual ArrayPtr ToArray() const = 0;
50+
51+
/// populate selection vector for all the set bits in the bitmap.
52+
///
53+
/// \param[in] : bitmap the bitmap
54+
/// \param[in] : bitmap_size size of the bitmap in bytes
55+
/// \param[in] : max_bitmap_index max valid index in bitmap (can be lesser than
56+
/// capacity in the bitmap, due to alignment/padding).
57+
Status PopulateFromBitMap(const uint8_t *bitmap, int bitmap_size, int max_bitmap_index);
58+
};
59+
60+
/// \brief template implementation of selection vector with a specific ctype and arrow
61+
/// type.
62+
template <typename C_TYPE, typename A_TYPE>
63+
class SelectionVectorImpl : public SelectionVector {
64+
public:
65+
SelectionVectorImpl(int max_slots, std::shared_ptr<arrow::Buffer> buffer)
66+
: max_slots_(max_slots), num_slots_(0), buffer_(buffer) {
67+
raw_data_ = reinterpret_cast<C_TYPE *>(buffer->mutable_data());
68+
}
69+
70+
int GetIndex(int index) const override {
71+
DCHECK_LE(index, max_slots_);
72+
return raw_data_[index];
73+
}
74+
75+
void SetIndex(int index, int value) override {
76+
DCHECK_LE(index, max_slots_);
77+
DCHECK_LE(value, GetMaxSupportedValue());
78+
79+
raw_data_[index] = value;
80+
}
81+
82+
ArrayPtr ToArray() const override;
83+
84+
int GetMaxSlots() const override { return max_slots_; }
85+
86+
int GetNumSlots() const override { return num_slots_; }
87+
88+
void SetNumSlots(int num_slots) override {
89+
DCHECK_LE(num_slots, max_slots_);
90+
num_slots_ = num_slots;
91+
}
92+
93+
protected:
94+
static Status AllocateBuffer(int max_slots, arrow::MemoryPool *pool,
95+
std::shared_ptr<arrow::Buffer> *buffer);
96+
97+
static Status ValidateBuffer(int max_slots, std::shared_ptr<arrow::Buffer> buffer);
98+
99+
/// maximum slots in the vector
100+
int max_slots_;
101+
102+
/// number of slots in the vector
103+
int num_slots_;
104+
105+
std::shared_ptr<arrow::Buffer> buffer_;
106+
C_TYPE *raw_data_;
107+
};
108+
109+
template <typename C_TYPE, typename A_TYPE>
110+
ArrayPtr SelectionVectorImpl<C_TYPE, A_TYPE>::ToArray() const {
111+
auto data_type = arrow::TypeTraits<A_TYPE>::type_singleton();
112+
auto array_data = arrow::ArrayData::Make(data_type, num_slots_, {nullptr, buffer_});
113+
return arrow::MakeArray(array_data);
114+
}
115+
116+
/// Selection vector that stores each index as an int16_t and converts to an arrow
/// array of arrow::Int16Type.
class SelectionVectorInt16 : public SelectionVectorImpl<int16_t, arrow::Int16Type> {
 public:
  SelectionVectorInt16(int max_slots, std::shared_ptr<arrow::Buffer> buffer)
      : SelectionVectorImpl<int16_t, arrow::Int16Type>(max_slots, buffer) {}

  /// The largest value that can be stored in a slot.
  int GetMaxSupportedValue() const override { return INT16_MAX; }

  /// \brief Make a selection vector on top of a caller-supplied buffer.
  ///
  /// \param[in] max_slots max number of slots
  /// \param[in] buffer buffer sized to accommodate max_slots
  /// \param[out] selection_vector selection vector backed by 'buffer'
  static Status Make(int max_slots, std::shared_ptr<arrow::Buffer> buffer,
                     std::shared_ptr<SelectionVectorInt16> *selection_vector);

  /// \brief Make a selection vector on top of a buffer allocated from a pool.
  ///
  /// \param[in] max_slots max number of slots
  /// \param[in] pool memory pool to allocate buffer
  /// \param[out] selection_vector selection vector backed by a buffer allocated from
  ///             the pool.
  static Status Make(int max_slots, arrow::MemoryPool *pool,
                     std::shared_ptr<SelectionVectorInt16> *selection_vector);
};
136+
137+
/// Selection vector that stores each index as an int32_t and converts to an arrow
/// array of arrow::Int32Type.
class SelectionVectorInt32 : public SelectionVectorImpl<int32_t, arrow::Int32Type> {
 public:
  SelectionVectorInt32(int max_slots, std::shared_ptr<arrow::Buffer> buffer)
      : SelectionVectorImpl<int32_t, arrow::Int32Type>(max_slots, buffer) {}

  /// The largest value that can be stored in a slot.
  int GetMaxSupportedValue() const override { return INT32_MAX; }

  /// \brief Make a selection vector on top of a caller-supplied buffer.
  ///
  /// \param[in] max_slots max number of slots
  /// \param[in] buffer buffer sized to accommodate max_slots
  /// \param[out] selection_vector selection vector backed by 'buffer'
  static Status Make(int max_slots, std::shared_ptr<arrow::Buffer> buffer,
                     std::shared_ptr<SelectionVectorInt32> *selection_vector);

  /// \brief Make a selection vector on top of a buffer allocated from a pool.
  ///
  /// \param[in] max_slots max number of slots
  /// \param[in] pool memory pool to allocate buffer
  /// \param[out] selection_vector selection vector backed by a buffer allocated from
  ///             the pool.
  static Status Make(int max_slots, arrow::MemoryPool *pool,
                     std::shared_ptr<SelectionVectorInt32> *selection_vector);
};
157+
158+
} // namespace gandiva
159+
160+
#endif // GANDIVA_SELECTION_VECTOR__H

include/gandiva/status.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535

3636
#define GANDIVA_RETURN_FAILURE_IF_FALSE(condition, status) \
3737
do { \
38-
if (!condition) { \
38+
if (!(condition)) { \
3939
Status _status = (status); \
4040
std::stringstream ss; \
4141
ss << __FILE__ << ":" << __LINE__ << " code: " << _status.CodeAsString() << " \n " \
@@ -197,4 +197,4 @@ inline Status& Status::operator&=(Status&& s) {
197197
}
198198

199199
} // namespace gandiva
200-
#endif // GANDIVA_STATUS_H
200+
#endif // GANDIVA_STATUS_H

include/gandiva/tree_expr_builder.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <string>
2020
#include <vector>
2121

22+
#include "gandiva/condition.h"
2223
#include "gandiva/expression.h"
2324

2425
namespace gandiva {
@@ -74,6 +75,13 @@ class TreeExprBuilder {
7475
/// returns null if the out_field is null.
7576
static ExpressionPtr MakeExpression(const std::string &function,
7677
const FieldVector &in_fields, FieldPtr out_field);
78+
79+
/// \brief create a condition with the specified root_node
80+
static ConditionPtr MakeCondition(NodePtr root_node);
81+
82+
/// \brief convenience function for simple function conditions.
83+
static ConditionPtr MakeCondition(const std::string &function,
84+
const FieldVector &in_fields);
7785
};
7886

7987
} // namespace gandiva

integ/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
project(gandiva)
1616

1717
foreach(lib_type "shared" "static")
18+
add_gandiva_integ_test(filter_test.cc gandiva_${lib_type})
1819
add_gandiva_integ_test(projector_test.cc gandiva_${lib_type})
1920
add_gandiva_integ_test(if_expr_test.cc gandiva_${lib_type})
2021
add_gandiva_integ_test(literal_test.cc gandiva_${lib_type})

0 commit comments

Comments
 (0)