-
Notifications
You must be signed in to change notification settings - Fork 3k
Support pyarrow large_list #7019
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 77 commits
0545de4
4f23eb0
c870450
427f117
69f3548
9fdec4d
84f3014
9ea8eaf
df13687
9bc5182
6345fdc
0d997cd
d1bd580
87bd7e3
a772762
882d363
78b3a8f
300a5a9
cd0901c
a2c7bd0
d0e114c
1f9f594
a4eb288
9020ccf
057d184
8f3b02c
f6e528f
89d4366
eaf4c64
1f28c5f
6f3604c
33a1a55
6e6e9b7
bfa8fae
8215a61
152d6dd
632d1ea
1f247bc
f08f216
a79e337
b76aaa0
a677143
31d22dd
79772a6
af22e52
0611fdc
a1eff5c
e72d8fe
bf646ac
78a9a78
968364c
60465af
77aa27f
19e9deb
b27a8a1
9ec883b
ab8724b
30ba3bc
b1a3db7
b2a5789
7c39b51
48d143c
3968181
8e94ca0
40622e5
1dea864
c3bacba
c055ff3
823a049
45326a9
9acf8d9
5c8646b
f11c56d
27d0f94
4821c24
41f6068
bb6baf5
431694f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -1884,7 +1884,7 @@ def array_cast( | |||||||||||||||
| return array | ||||||||||||||||
| arrays = [_c(array.field(field.name), field.type) for field in pa_type] | ||||||||||||||||
| return pa.StructArray.from_arrays(arrays, fields=list(pa_type), mask=array.is_null()) | ||||||||||||||||
| elif pa.types.is_list(array.type): | ||||||||||||||||
| elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type): | ||||||||||||||||
| if pa.types.is_fixed_size_list(pa_type): | ||||||||||||||||
| if _are_list_values_of_length(array, pa_type.list_size): | ||||||||||||||||
| if array.null_count > 0: | ||||||||||||||||
|
|
@@ -1911,6 +1911,10 @@ def array_cast( | |||||||||||||||
| # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError | ||||||||||||||||
| array_offsets = _combine_list_array_offsets_with_mask(array) | ||||||||||||||||
| return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type)) | ||||||||||||||||
| elif pa.types.is_large_list(pa_type): | ||||||||||||||||
| # Merge offsets with the null bitmap to avoid the "Null bitmap with offsets slice not supported" ArrowNotImplementedError | ||||||||||||||||
| array_offsets = _combine_list_array_offsets_with_mask(array) | ||||||||||||||||
| return pa.LargeListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type)) | ||||||||||||||||
| elif pa.types.is_fixed_size_list(array.type): | ||||||||||||||||
| if pa.types.is_fixed_size_list(pa_type): | ||||||||||||||||
| if pa_type.list_size == array.type.list_size: | ||||||||||||||||
|
|
@@ -1923,6 +1927,11 @@ def array_cast( | |||||||||||||||
| elif pa.types.is_list(pa_type): | ||||||||||||||||
| array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size | ||||||||||||||||
| return pa.ListArray.from_arrays(array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null()) | ||||||||||||||||
| elif pa.types.is_large_list(pa_type): | ||||||||||||||||
| array_offsets = (np.arange(len(array) + 1) + array.offset) * array.type.list_size | ||||||||||||||||
| return pa.LargeListArray.from_arrays( | ||||||||||||||||
| array_offsets, _c(array.values, pa_type.value_type), mask=array.is_null() | ||||||||||||||||
| ) | ||||||||||||||||
| else: | ||||||||||||||||
| if pa.types.is_string(pa_type): | ||||||||||||||||
| if not allow_primitive_to_str and pa.types.is_primitive(array.type): | ||||||||||||||||
|
|
@@ -1972,7 +1981,7 @@ def cast_array_to_feature( | |||||||||||||||
| Returns: | ||||||||||||||||
| array (`pyarrow.Array`): the casted array | ||||||||||||||||
| """ | ||||||||||||||||
| from .features.features import Sequence, get_nested_type | ||||||||||||||||
| from .features.features import LargeList, Sequence, get_nested_type | ||||||||||||||||
|
|
||||||||||||||||
| _c = partial( | ||||||||||||||||
| cast_array_to_feature, | ||||||||||||||||
|
|
@@ -1988,24 +1997,34 @@ def cast_array_to_feature( | |||||||||||||||
| elif pa.types.is_struct(array.type): | ||||||||||||||||
| # feature must be a dict or Sequence(subfeatures_dict) | ||||||||||||||||
| if isinstance(feature, Sequence) and isinstance(feature.feature, dict): | ||||||||||||||||
| feature = { | ||||||||||||||||
| name: Sequence(subfeature, length=feature.length) for name, subfeature in feature.feature.items() | ||||||||||||||||
| } | ||||||||||||||||
| sequence_kwargs = vars(feature).copy() | ||||||||||||||||
| feature = sequence_kwargs.pop("feature") | ||||||||||||||||
| feature = {name: Sequence(subfeature, **sequence_kwargs) for name, subfeature in feature.items()} | ||||||||||||||||
| if isinstance(feature, dict) and {field.name for field in array.type} == set(feature): | ||||||||||||||||
| if array.type.num_fields == 0: | ||||||||||||||||
| return array | ||||||||||||||||
| arrays = [_c(array.field(name), subfeature) for name, subfeature in feature.items()] | ||||||||||||||||
| return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null()) | ||||||||||||||||
| elif pa.types.is_list(array.type): | ||||||||||||||||
| elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type): | ||||||||||||||||
| # feature must be either [subfeature] or Sequence(subfeature) | ||||||||||||||||
|
||||||||||||||||
| elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type): | |
| # feature must be either [subfeature] or Sequence(subfeature) | |
| elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type): | |
| # feature must be either [subfeature] or LargeList(subfeature) or Sequence(subfeature) |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe simpler?
| if type(array.type) is type(get_nested_type(feature)) and casted_array_values.type == array.values.type: | |
| # Both array and feature have equal: list type and values (within the list) types | |
| if pa.types.is_list(array.type) and casted_array_values.type == array.values.type: | |
| # Both array and feature have equal: list type and values (within the list) types |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here.
| if type(array.type) is type(get_nested_type(feature)) and casted_array_values.type == array.values.type: | |
| # Both array and feature have equal: list type and values (within the list) types | |
| return array | |
| if pa.types.is_large_list(array.type) and casted_array_values.type == array.values.type: | |
| # Both array and feature have equal: large list type and values (within the list) types | |
| return array |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here.
| if ( | |
| type(array.type) is type(get_nested_type(feature)) | |
| and casted_array_values.type == array.values.type | |
| ): | |
| # Both array and feature have equal: list type and values (within the list) types | |
| if pa.types.is_list(array.type) and casted_array_values.type == array.values.type: | |
| # Both array and feature have equal: list type and values (within the list) types |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| # feature must be either LargeList(subfeature) | |
| # feature must be LargeList(subfeature) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Those changes are not necessary, but I'm fine with keeping them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I made them when implementing Sequence.large and decided to keep them for robustness in case we add some other attribute to Sequence in the future.