Conversation
Codecov Report❌ Patch coverage is Additional details and impacted files@@ Coverage Diff @@
## main #1009 +/- ##
==========================================
+ Coverage 66.68% 68.45% +1.77%
==========================================
Files 186 195 +9
Lines 66795 68055 +1260
Branches 9507 9705 +198
==========================================
+ Hits 44543 46589 +2046
+ Misses 19572 18603 -969
- Partials 2680 2863 +183 |
bodo/hiframes/boxing.py
Outdated
| arr_type = bodo.types.boolean_array_type | ||
|
|
||
| if arr_type == types.Array(types.NPDatetime("us"), 1, "C"): | ||
| # Make sure datetime64 arrays are ns |
There was a problem hiding this comment.
How is this making sure they are ns?
There was a problem hiding this comment.
DatetimeArrayType normalizes the unit to nanosecond during unboxing (through Arrow).
bodo/hiframes/boxing.py
Outdated
| arr_type = bodo.types.DatetimeArrayType(None) | ||
|
|
||
| # Make sure timedelta64 arrays are ns | ||
| if isinstance(arr_type, types.Array) and isinstance( |
There was a problem hiding this comment.
This is a specific case to timedelta.
|
|
||
| # We make all Series data arrays contiguous during unboxing to avoid type errors | ||
| # see test_df_query_stringliteral_expr | ||
| if isinstance(arr_type, types.Array): |
There was a problem hiding this comment.
Maybe one check for isinstance(arr_type, types.Array) and inside that if the additional checks above with an else for this current category. Eliminate 3 checks for isinstance?
There was a problem hiding this comment.
Done. Refactored this section to have only one types.Array type check.
| normalize=False, | ||
| name=None, | ||
| closed=None, | ||
| inclusive="both", |
There was a problem hiding this comment.
So, Pandas 3 only once this is merged?
There was a problem hiding this comment.
Yes, we can't keep many versions of different APIs practically. These minor differences shouldn't matter much anyways.
bodo/hiframes/pd_rolling_ext.py
Outdated
| not isinstance(on_data_type, types.Array) | ||
| or on_data_type.dtype != bodo.types.datetime64ns | ||
| ): | ||
| ) and not on_data_type == bodo.types.DatetimeArrayType(None): |
| n_int64 = bodo.hiframes.datetime_timedelta_ext.cast_numpy_timedelta_to_int(dt64) | ||
| return pd.Timedelta(n_int64) | ||
| def convert_numpy_timedelta64_to_pd_timedelta(td64): # pragma: no cover | ||
| return td64 |
There was a problem hiding this comment.
Only needs conversion in jitted code?
There was a problem hiding this comment.
Not called in non-jitted code. This is just a placeholder.
There was a problem hiding this comment.
Should throw an exception then?
There was a problem hiding this comment.
Probably, we have a lot of these in the code base. Not worth going through them right now I think.
bodo/hiframes/series_dt_impl.py
Outdated
| func_text += " out_arr[i] = ts." + field + "\n" | ||
| else: | ||
| func_text += f" out_arr[i] = arr[i].{field}\n" | ||
| call_parans = "()" if field == "weekday" else "" |
| func_text += " min_val = bodo.libs.array_ops.array_op_min(arr)\n" | ||
| func_text += " max_val = bodo.libs.array_ops.array_op_max(arr)\n" | ||
| if dtype == bodo.types.datetime64ns: | ||
| if dtype == bodo.types.datetime64ns or isinstance( |
There was a problem hiding this comment.
one isinstance with two possible targets?
There was a problem hiding this comment.
There is a line of code in between.
| if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( | ||
| pa.types.is_string(values.type) or _is_string_view(values.type) | ||
| # Bodo change: allow dictionary-encoded string arrays | ||
| # or ( |
There was a problem hiding this comment.
Keeping the disabled code around for documentation and context to help with later upgrades.
| @@ -239,6 +246,7 @@ nccl = ">=2.18" | |||
| numba = ">=0.60,<0.62.0" | |||
| pyarrow = "21.0.*" | |||
| libarrow = "21.0.*" | |||
| pandas = ">=2.2.0" | |||
There was a problem hiding this comment.
What are the implications of 2.2 here?
There was a problem hiding this comment.
I think it's for compatibility with older pyarrow dependency in the GPU env?
There was a problem hiding this comment.
This is just moved here in this PR.
scott-routledge2
left a comment
There was a problem hiding this comment.
Thanks @ehsantn LGTM!
| assert generated_ctes == 1 | ||
|
|
||
|
|
||
| @pytest.mark.jit_dependency |
There was a problem hiding this comment.
Why does this test require JIT?
There was a problem hiding this comment.
It has apply in it. I don't know how it worked before.
|
|
||
|
|
||
| @pytest.mark.parametrize("filter", ["IS_NULL", "IS_NOT_NULL", "IS_IN"]) | ||
| # TODO: fix Pandas 3 issues with IS_NULL and IS_NOT_NULL |
There was a problem hiding this comment.
Open a followup issue?
| def test_pq_read_types(fname, datapath, memory_leak_check): | ||
| def test_impl(fname): | ||
| return pd.read_parquet(fname) | ||
| return pd.read_parquet(fname, dtype_backend="pyarrow") |
There was a problem hiding this comment.
Do we need to update our docs/examples to reflect changes to parameters like requiring dtype_backend="pyarrow" in read csv/parquet calls?
There was a problem hiding this comment.
We don't require this parameter. This is just for testing to make sure data types match and we don't run into unnecessary issues.
bodo/tests/utils.py
Outdated
| @@ -1641,10 +1767,12 @@ def _test_equal( | |||
| reset_index, | |||
| ) | |||
| elif py_out is pd.NaT: | |||
| assert py_out is bodo_out | |||
| # TODO: return pd.NaT for pd.to_datetime(None) and pd.to_timedelta(ts_val) | |||
| @@ -239,6 +246,7 @@ nccl = ">=2.18" | |||
| numba = ">=0.60,<0.62.0" | |||
| pyarrow = "21.0.*" | |||
| libarrow = "21.0.*" | |||
| pandas = ">=2.2.0" | |||
There was a problem hiding this comment.
I think it's for compatibility with older pyarrow dependency in the GPU env?
Changes included in this PR
As title. Major changes include:
Testing strategy
Existing unit tests.
User facing changes
None.
Checklist
[run CI]in your commit message.