Skip to content

Commit 787afa1

Browse files
GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes (#39652)
### Rationale for this change

First step for #39633: exposing the Array, DataType and Scalar classes for BinaryView and StringView, so that those types can already be represented in pyarrow. (I also exposed a variant of StringBuilder, for now just to be able to create test data.)

* Closes: #39651

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
1 parent c6ab286 commit 787afa1

16 files changed

Lines changed: 223 additions & 6 deletions

File tree

docs/source/python/api/arrays.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ may expose data type-specific methods or properties.
6363
FixedSizeBinaryArray
6464
LargeBinaryArray
6565
LargeStringArray
66+
BinaryViewArray
67+
StringViewArray
6668
Time32Array
6769
Time64Array
6870
Date32Array
@@ -119,6 +121,8 @@ classes may expose data type-specific methods or properties.
119121
FixedSizeBinaryScalar
120122
LargeBinaryScalar
121123
LargeStringScalar
124+
BinaryViewScalar
125+
StringViewScalar
122126
Time32Scalar
123127
Time64Scalar
124128
Date32Scalar

docs/source/python/api/datatypes.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas.
5555
large_binary
5656
large_string
5757
large_utf8
58+
binary_view
59+
string_view
5860
decimal128
5961
list_
6062
large_list
@@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category
168170
is_large_binary
169171
is_large_unicode
170172
is_large_string
173+
is_binary_view
174+
is_string_view
171175
is_fixed_size_binary
172176
is_map
173177
is_dictionary

python/pyarrow/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def print_entry(label, value):
163163
time32, time64, timestamp, date32, date64, duration,
164164
month_day_nano_interval,
165165
float16, float32, float64,
166-
binary, string, utf8,
166+
binary, string, utf8, binary_view, string_view,
167167
large_binary, large_string, large_utf8,
168168
decimal128, decimal256,
169169
list_, large_list, map_, struct,
@@ -205,6 +205,7 @@ def print_entry(label, value):
205205
FixedSizeListArray, UnionArray,
206206
BinaryArray, StringArray,
207207
LargeBinaryArray, LargeStringArray,
208+
BinaryViewArray, StringViewArray,
208209
FixedSizeBinaryArray,
209210
DictionaryArray,
210211
Date32Array, Date64Array, TimestampArray,
@@ -223,8 +224,8 @@ def print_entry(label, value):
223224
Time32Scalar, Time64Scalar,
224225
TimestampScalar, DurationScalar,
225226
MonthDayNanoIntervalScalar,
226-
BinaryScalar, LargeBinaryScalar,
227-
StringScalar, LargeStringScalar,
227+
BinaryScalar, LargeBinaryScalar, BinaryViewScalar,
228+
StringScalar, LargeStringScalar, StringViewScalar,
228229
FixedSizeBinaryScalar, DictionaryScalar,
229230
MapScalar, StructScalar, UnionScalar,
230231
RunEndEncodedScalar, ExtensionScalar)

python/pyarrow/array.pxi

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2942,6 +2942,12 @@ cdef class LargeStringArray(Array):
29422942
null_count, offset)
29432943

29442944

2945+
cdef class StringViewArray(Array):
2946+
"""
2947+
Concrete class for Arrow arrays of string (or utf8) view data type.
2948+
"""
2949+
2950+
29452951
cdef class BinaryArray(Array):
29462952
"""
29472953
Concrete class for Arrow arrays of variable-sized binary data type.
@@ -2968,6 +2974,12 @@ cdef class LargeBinaryArray(Array):
29682974
return (<CLargeBinaryArray*> self.ap).total_values_length()
29692975

29702976

2977+
cdef class BinaryViewArray(Array):
2978+
"""
2979+
Concrete class for Arrow arrays of variable-sized binary view data type.
2980+
"""
2981+
2982+
29712983
cdef class DictionaryArray(Array):
29722984
"""
29732985
Concrete class for dictionary-encoded Arrow arrays.
@@ -3669,6 +3681,8 @@ cdef dict _array_classes = {
36693681
_Type_STRING: StringArray,
36703682
_Type_LARGE_BINARY: LargeBinaryArray,
36713683
_Type_LARGE_STRING: LargeStringArray,
3684+
_Type_BINARY_VIEW: BinaryViewArray,
3685+
_Type_STRING_VIEW: StringViewArray,
36723686
_Type_DICTIONARY: DictionaryArray,
36733687
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
36743688
_Type_DECIMAL128: Decimal128Array,

python/pyarrow/builder.pxi

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable):
8080

8181
def __len__(self):
8282
return self.builder.get().length()
83+
84+
85+
cdef class StringViewBuilder(_Weakrefable):
86+
"""
87+
Builder class for UTF8 string views.
88+
89+
This class exposes facilities for incrementally adding string values and
90+
building the null bitmap for a pyarrow.Array (type='string_view').
91+
"""
92+
cdef:
93+
unique_ptr[CStringViewBuilder] builder
94+
95+
def __cinit__(self, MemoryPool memory_pool=None):
96+
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
97+
self.builder.reset(new CStringViewBuilder(pool))
98+
99+
def append(self, value):
100+
"""
101+
Append a single value to the builder.
102+
103+
The value can either be a string/bytes object or a null value
104+
(np.nan or None).
105+
106+
Parameters
107+
----------
108+
value : string/bytes or np.nan/None
109+
The value to append to the string array builder.
110+
"""
111+
if value is None or value is np.nan:
112+
self.builder.get().AppendNull()
113+
elif isinstance(value, (bytes, str)):
114+
self.builder.get().Append(tobytes(value))
115+
else:
116+
raise TypeError('StringViewBuilder only accepts string objects')
117+
118+
def append_values(self, values):
119+
"""
120+
Append all the values from an iterable.
121+
122+
Parameters
123+
----------
124+
values : iterable of string/bytes or np.nan/None values
125+
The values to append to the string array builder.
126+
"""
127+
for value in values:
128+
self.append(value)
129+
130+
def finish(self):
131+
"""
132+
Return result of builder as an Array object; also resets the builder.
133+
134+
Returns
135+
-------
136+
array : pyarrow.Array
137+
"""
138+
cdef shared_ptr[CArray] out
139+
with nogil:
140+
self.builder.get().Finish(&out)
141+
return pyarrow_wrap_array(out)
142+
143+
@property
144+
def null_count(self):
145+
return self.builder.get().null_count()
146+
147+
def __len__(self):
148+
return self.builder.get().length()

python/pyarrow/includes/libarrow.pxd

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
126126
_Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
127127
_Type_LARGE_STRING" arrow::Type::LARGE_STRING"
128128
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
129+
_Type_BINARY_VIEW" arrow::Type::BINARY_VIEW"
130+
_Type_STRING_VIEW" arrow::Type::STRING_VIEW"
129131

130132
_Type_LIST" arrow::Type::LIST"
131133
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
@@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil:
12951297

12961298
cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder):
12971299
CStringBuilder(CMemoryPool* pool)
1300+
CStatus Append(const c_string& value)
1301+
1302+
cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder):
1303+
CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool)
1304+
CStatus Append(const char* value, int32_t length)
12981305

1306+
cdef cppclass CStringViewBuilder" arrow::StringViewBuilder"(CBinaryViewBuilder):
1307+
CStringViewBuilder(CMemoryPool* pool)
12991308
CStatus Append(const c_string& value)
13001309

13011310
cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder):

python/pyarrow/lib.pxd

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,14 @@ cdef class BinaryArray(Array):
445445
pass
446446

447447

448+
cdef class StringViewArray(Array):
449+
pass
450+
451+
452+
cdef class BinaryViewArray(Array):
453+
pass
454+
455+
448456
cdef class DictionaryArray(Array):
449457
cdef:
450458
object _indices, _dictionary

python/pyarrow/lib.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ Type_STRING = _Type_STRING
106106
Type_LARGE_BINARY = _Type_LARGE_BINARY
107107
Type_LARGE_STRING = _Type_LARGE_STRING
108108
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
109+
Type_BINARY_VIEW = _Type_BINARY_VIEW
110+
Type_STRING_VIEW = _Type_STRING_VIEW
109111
Type_LIST = _Type_LIST
110112
Type_LARGE_LIST = _Type_LARGE_LIST
111113
Type_MAP = _Type_MAP

python/pyarrow/scalar.pxi

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar):
665665
pass
666666

667667

668+
cdef class BinaryViewScalar(BinaryScalar):
669+
pass
670+
671+
672+
cdef class StringViewScalar(StringScalar):
673+
pass
674+
675+
668676
cdef class ListScalar(Scalar):
669677
"""
670678
Concrete class for list-like scalars.
@@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = {
10511059
_Type_BINARY: BinaryScalar,
10521060
_Type_LARGE_BINARY: LargeBinaryScalar,
10531061
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar,
1062+
_Type_BINARY_VIEW: BinaryViewScalar,
10541063
_Type_STRING: StringScalar,
10551064
_Type_LARGE_STRING: LargeStringScalar,
1065+
_Type_STRING_VIEW: StringViewScalar,
10561066
_Type_LIST: ListScalar,
10571067
_Type_LARGE_LIST: LargeListScalar,
10581068
_Type_FIXED_SIZE_LIST: FixedSizeListScalar,

python/pyarrow/src/arrow/python/helpers.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
6363
GET_PRIMITIVE_TYPE(STRING, utf8);
6464
GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
6565
GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
66+
GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
67+
GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
6668
GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
6769
default:
6870
return nullptr;

0 commit comments

Comments
 (0)