diff --git a/marimo/_plugins/ui/_impl/charts/altair_transformer.py b/marimo/_plugins/ui/_impl/charts/altair_transformer.py index 0fc3eb4944f..690a7287fd1 100644 --- a/marimo/_plugins/ui/_impl/charts/altair_transformer.py +++ b/marimo/_plugins/ui/_impl/charts/altair_transformer.py @@ -154,7 +154,7 @@ def sanitize_nan_infs(data: Any) -> Any: """Sanitize NaN and Inf values in Dataframes for JSON serialization.""" if can_narwhalify(data): narwhals_data = nw.from_native(data) - is_prev_lazy = isinstance(narwhals_data, nw.LazyFrame) + is_prev_lazy = is_narwhals_lazyframe(narwhals_data) # Convert to lazy for optimization if not already lazy if not is_prev_lazy: diff --git a/marimo/_plugins/ui/_impl/table.py b/marimo/_plugins/ui/_impl/table.py index f6661be4ffd..89fac1360ee 100644 --- a/marimo/_plugins/ui/_impl/table.py +++ b/marimo/_plugins/ui/_impl/table.py @@ -952,24 +952,28 @@ def _get_column_summaries( ) # For boolean columns, we can drop the column since we use stats - column_type = self._manager.get_field_type(column) - if column_type[0] == "boolean": + (column_type, external_type) = self._manager.get_field_type( + column + ) + if column_type == "boolean": data = data.drop_columns([column]) - # Bin values are only supported for numeric and temporal columns - if column_type[0] not in [ - "integer", - "number", - "date", - "datetime", - "time", - "string", - ]: - continue + # Handle columns with all nulls first + # These get empty bins regardless of type + if statistic and statistic.nulls == total_rows: + try: + bin_values[column] = [] + data = data.drop_columns([column]) + continue + except BaseException as e: + LOGGER.warning( + "Failed to drop all-null column %s: %s", column, e + ) + continue # For perf, we only compute value counts for categorical columns - external_type = column_type[1].lower() - if column_type[0] == "string" and ( + external_type = external_type.lower() + if column_type == "string" and ( "cat" in external_type or "enum" in external_type ): try: @@ -987,13 +991,22 @@ def _get_column_summaries( e, ) + # Bin values are only supported for numeric and temporal columns + if column_type not in [ + "integer", + "number", + "date", + "datetime", + "time", + ]: + continue + try: - if statistic and statistic.nulls == total_rows: - bins = [] - else: - bins = data.get_bin_values(column, DEFAULT_BIN_SIZE) + bins = data.get_bin_values(column, DEFAULT_BIN_SIZE) bin_values[column] = bins - data = data.drop_columns([column]) + # Only drop column if we got bins to visualize + if len(bins) > 0: + data = data.drop_columns([column]) continue except BaseException as e: LOGGER.warning( diff --git a/marimo/_plugins/ui/_impl/tables/ibis_table.py b/marimo/_plugins/ui/_impl/tables/ibis_table.py index b52399e9863..2fab52bcbfc 100644 --- a/marimo/_plugins/ui/_impl/tables/ibis_table.py +++ b/marimo/_plugins/ui/_impl/tables/ibis_table.py @@ -3,30 +3,19 @@ import datetime import functools -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Union + +import narwhals.stable.v2 as nw from marimo import _loggers -from marimo._data.models import BinValue, ColumnStats, ExternalDataType -from marimo._dependencies.dependencies import DependencyManager -from marimo._plugins.ui._impl.tables.format import ( - FormatMapping, -) -from marimo._plugins.ui._impl.tables.pandas_table import ( - PandasTableManagerFactory, -) -from marimo._plugins.ui._impl.tables.polars_table import ( - PolarsTableManagerFactory, -) +from marimo._data.models import BinValue, ExternalDataType +from marimo._plugins.ui._impl.tables.narwhals_table import NarwhalsTableManager from marimo._plugins.ui._impl.tables.table_manager import ( ColumnName, FieldType, - FieldTypes, - TableCell, - TableCoordinate, TableManager, TableManagerFactory, ) -from marimo._utils.memoize import memoize_last_value LOGGER = _loggers.marimo_logger() @@ -44,127 +33,24 @@ def package_name() -> str: def create() -> type[TableManager[Any]]: import ibis # type: ignore - class IbisTableManager(TableManager[ibis.Table]): + class IbisTableManager(NarwhalsTableManager[ibis.Table, ibis.Table]): type = "ibis" - def to_csv_str( - self, format_mapping: Optional[FormatMapping] = None - ) -> str: - return self._as_table_manager().to_csv_str(format_mapping) - - def to_json_str( - self, format_mapping: Optional[FormatMapping] = None - ) -> str: - return self._as_table_manager().to_json_str(format_mapping) - - def to_parquet(self) -> bytes: - return self._as_table_manager().to_parquet() - - def supports_download(self) -> bool: - return False - - def apply_formatting( - self, format_mapping: Optional[FormatMapping] - ) -> IbisTableManager: - raise NotImplementedError("Column formatting not supported") - - def supports_filters(self) -> bool: - return True - - def select_rows( - self, indices: list[int] - ) -> TableManager[ibis.Table]: - if not indices: - return self.take(0, 0) # Return empty table - # Select rows using Ibis API - return IbisTableManager( - self.data.filter(ibis.row_number().over().isin(indices)) - ) + def __init__(self, data: ibis.Table) -> None: + self._original_data = data + super().__init__(nw.from_native(data)) - def select_columns( - self, columns: list[str] - ) -> TableManager[ibis.Table]: - return IbisTableManager(self.data.select(columns)) - - def select_cells( - self, cells: list[TableCoordinate] - ) -> list[TableCell]: - del cells - raise NotImplementedError("Cell selection not supported") - - def drop_columns( - self, columns: list[str] - ) -> TableManager[ibis.Table]: - return IbisTableManager(self.data.drop(columns)) - - def get_row_headers(self) -> FieldTypes: - return [] + def collect(self) -> ibis.Table: + return self._original_data @staticmethod def is_type(value: Any) -> bool: return isinstance(value, ibis.Table) - def take(self, count: int, offset: int) -> IbisTableManager: - if count < 0: - raise ValueError("Count must be a positive integer") - if offset < 0: - raise ValueError("Offset must be a non-negative integer") - return IbisTableManager(self.data.limit(count, offset=offset)) - - def search(self, query: str) -> TableManager[Any]: - query = query.lower() - predicates = [] - for column in self.data.columns: - col = self.data[column] - if col.type().is_string(): - predicates.append(col.lower().rlike(query)) - elif col.type().is_numeric(): - predicates.append( - col.cast("string").lower().contains(query) - ) - elif col.type().is_boolean(): - predicates.append( - col.cast("string").lower().contains(query) - ) - elif col.type().is_timestamp(): - predicates.append( - col.cast("string").lower().contains(query) - ) - elif col.type().is_date(): - predicates.append( - col.cast("string").lower().contains(query) - ) - elif col.type().is_time(): - predicates.append( - col.cast("string").lower().contains(query) - ) - - if predicates: - filtered = self.data.filter(ibis.or_(*predicates)) - else: - filtered = self.data.filter(ibis.literal(False)) - - return IbisTableManager(filtered) - - def get_stats(self, column: str) -> ColumnStats: - col = self.data[column] - total = self.data.count().execute() - nulls = col.isnull().sum().execute() - - stats = ColumnStats(total=total, nulls=nulls) - - if col.type().is_numeric(): - stats.min = col.min().execute() - stats.max = col.max().execute() - stats.mean = col.mean().execute() - stats.median = col.median().execute() - stats.std = col.std().execute() - - return stats - def _get_numeric_bin_values( self, col: ibis.Column, num_bins: int ) -> ibis.Table: + data = self._original_data min_val = col.min().execute() max_val = col.max().execute() @@ -184,7 +70,7 @@ def _get_numeric_bin_values( bin_width = (max_val - min_val) / num_bins # Assign bins and count occurrences - data = self.data.mutate(bin=col.histogram(nbins=num_bins)) + data = data.mutate(bin=col.histogram(nbins=num_bins)) value_counts = data["bin"].value_counts(name="count") # Fill in missing bins @@ -217,11 +103,12 @@ def get_bin_values( Returns: list[BinValue]: The bin values. """ - if column not in self.data.columns: + data = self._original_data + if column not in data.columns: LOGGER.error(f"Column {column} not found in Ibis table") return [] - col = self.data[column] + col = data[column] dtype = col.type() if dtype.is_temporal(): @@ -246,6 +133,8 @@ def get_bin_values( def _get_bin_values_temporal( self, column: ColumnName, dtype: DataType, num_bins: int ) -> list[BinValue]: + data = self._original_data + def _convert_ms_to_time(ms: int) -> datetime.time: hours = ms // 3600000 minutes = (ms % 3600000) // 60000 @@ -253,7 +142,7 @@ def _convert_ms_to_time(ms: int) -> datetime.time: microseconds = (ms % 1000) * 1000 return datetime.time(hours, minutes, seconds, microseconds) - col = self.data[column] + col = data[column] if dtype.is_time(): col_agg = ( @@ -301,65 +190,10 @@ def _convert_ms_to_time(ms: int) -> datetime.time: return bin_values - @memoize_last_value - def get_num_rows(self, force: bool = True) -> Optional[int]: - if force: - return self.data.count().execute() # type: ignore - return None - - def get_num_columns(self) -> int: - return len(self.data.columns) - - def get_column_names(self) -> list[str]: - return self.data.columns # type: ignore - - def get_unique_column_values( - self, column: str - ) -> list[str | int | float]: - result = ( - self.data.distinct(on=column) - .select(column) - .execute()[column] - .tolist() - ) - return result # type: ignore - - def get_sample_values(self, column: str) -> list[Any]: - # Don't sample values for Ibis tables - # since it can be expensive - del column - return [] - - def sort_values( - self, by: ColumnName, descending: bool - ) -> IbisTableManager: - sorted_data = self.data.order_by( - ibis.desc(by) if descending else ibis.asc(by) - ) - return IbisTableManager(sorted_data) - - @functools.lru_cache(maxsize=5) # noqa: B019 - def calculate_top_k_rows( - self, column: ColumnName, k: int - ) -> list[tuple[Any, int]]: - count_col_name = f"{column}_count" - result = ( - self.data[[column]] - .value_counts(name=count_col_name) - .order_by(ibis.desc(count_col_name)) - .limit(k) - .execute() - ) - - return [ - (row[0], int(row[1])) - for row in result.itertuples(index=False) - ] - def get_field_type( self, column_name: str ) -> tuple[FieldType, ExternalDataType]: - column = self.data[column_name] + column = self._original_data[column_name] dtype = column.type() if dtype.is_string(): return ("string", str(dtype)) @@ -378,18 +212,4 @@ def get_field_type( else: return ("unknown", str(dtype)) - def _as_table_manager(self) -> TableManager[Any]: - if DependencyManager.pandas.has(): - return PandasTableManagerFactory.create()( - self.data.to_pandas() - ) - if DependencyManager.polars.has(): - return PolarsTableManagerFactory.create()( - self.data.to_polars() - ) - - raise ValueError( - "Requires at least one of pandas, polars, or pyarrow" - ) - return IbisTableManager diff --git a/marimo/_plugins/ui/_impl/tables/narwhals_table.py b/marimo/_plugins/ui/_impl/tables/narwhals_table.py index 7b66bcfb473..348df8bdccb 100644 --- a/marimo/_plugins/ui/_impl/tables/narwhals_table.py +++ b/marimo/_plugins/ui/_impl/tables/narwhals_table.py @@ -188,7 +188,7 @@ def _calculate_top_k_rows( ) .head(k) ) - if isinstance(result, nw.LazyFrame): + if is_narwhals_lazyframe(result): return result.collect() return result @@ -342,6 +342,13 @@ def _get_stats_internal(self, column: str) -> ColumnStats: "nulls": col.null_count(), } + # As of Sep 2025, pyarrow and ibis do not support quantiles + # through narwhals + supports_quantiles = ( + not frame.implementation.is_pyarrow() + and not frame.implementation.is_ibis() + ) + if is_narwhals_string_type(dtype): exprs["unique"] = col.n_unique() elif dtype == nw.Boolean: @@ -394,8 +401,7 @@ def _get_stats_internal(self, column: str) -> ColumnStats: "max": col.max(), } ) - # Arrow does not support mean or quantile - if not frame.implementation.is_pyarrow(): + if supports_quantiles: exprs.update( { "mean": col.mean(), @@ -413,28 +419,39 @@ def _get_stats_internal(self, column: str) -> ColumnStats: "min": col.min(), "max": col.max(), "mean": col.mean(), - "median": col.quantile(0.5, interpolation="nearest"), "std": col.std(), - "p5": col.quantile(0.05, interpolation="nearest"), - "p25": col.quantile(0.25, interpolation="nearest"), - "p75": col.quantile(0.75, interpolation="nearest"), - "p95": col.quantile(0.95, interpolation="nearest"), + "median": col.median(), } ) + if supports_quantiles: + exprs.update( + { + "p5": col.quantile(0.05, interpolation="nearest"), + "p25": col.quantile(0.25, interpolation="nearest"), + "p75": col.quantile(0.75, interpolation="nearest"), + "p95": col.quantile(0.95, interpolation="nearest"), + } + ) elif dtype.is_numeric(): exprs.update( { + "unique": col.n_unique(), "min": col.min(), "max": col.max(), "mean": col.mean(), - "median": col.quantile(0.5, interpolation="nearest"), "std": col.std(), - "p5": col.quantile(0.05, interpolation="nearest"), - "p25": col.quantile(0.25, interpolation="nearest"), - "p75": col.quantile(0.75, interpolation="nearest"), - "p95": col.quantile(0.95, interpolation="nearest"), + "median": col.median(), } ) + if supports_quantiles: + exprs.update( + { + "p5": col.quantile(0.05, interpolation="nearest"), + "p25": col.quantile(0.25, interpolation="nearest"), + "p75": col.quantile(0.75, interpolation="nearest"), + "p95": col.quantile(0.95, interpolation="nearest"), + } + ) stats = frame.select(**exprs) stats_dict = stats.collect().rows(named=True)[0] @@ -573,7 +590,7 @@ def get_column_names(self) -> list[str]: def get_unique_column_values(self, column: str) -> list[str | int | float]: frame = self.data.select(nw.col(column)) - if isinstance(frame, nw.LazyFrame): + if is_narwhals_lazyframe(frame): frame = frame.collect() try: return frame[column].unique().to_list() diff --git a/marimo/_smoke_tests/ibis_example.py b/marimo/_smoke_tests/ibis_example.py index 70d65bec303..741508b8d0c 100644 --- a/marimo/_smoke_tests/ibis_example.py +++ b/marimo/_smoke_tests/ibis_example.py @@ -7,7 +7,7 @@ import marimo -__generated_with = "0.15.5" +__generated_with = "0.16.3" app = marimo.App(width="medium") @@ -29,8 +29,14 @@ def _(ibis): @app.cell -def _(df): +def _(df, mo): # Print Ibis data in a pretty table + mo.ui.table(df) + return + + +@app.cell +def _(df): df.to_polars() return diff --git a/tests/_plugins/ui/_impl/tables/snapshots/ibis.csv b/tests/_plugins/ui/_impl/tables/snapshots/ibis.csv index c63db10fe3f..d7e9a86219a 100644 --- a/tests/_plugins/ui/_impl/tables/snapshots/ibis.csv +++ b/tests/_plugins/ui/_impl/tables/snapshots/ibis.csv @@ -1,4 +1,4 @@ -strings,bool,int,float,date,datetime,time,nulls -a,True,1,1.0,2021-01-01 00:00:00,2021-01-01 00:00:00,01:02:03, -b,False,2,2.0,2021-01-02 00:00:00,2021-01-02 00:00:00,04:05:06,data -c,True,3,3.0,2021-01-03 00:00:00,2021-01-03 00:00:00,07:08:09, +"strings","bool","int","float","date","datetime","time","nulls" +"a",true,1,1,2021-01-01,2021-01-01 00:00:00.000000,01:02:03.000000000, +"b",false,2,2,2021-01-02,2021-01-02 00:00:00.000000,04:05:06.000000000,"data" +"c",true,3,3,2021-01-03,2021-01-03 00:00:00.000000,07:08:09.000000000, diff --git a/tests/_plugins/ui/_impl/tables/snapshots/ibis.json b/tests/_plugins/ui/_impl/tables/snapshots/ibis.json index 14aeb7cfb35..06fdcb64ef0 100644 --- a/tests/_plugins/ui/_impl/tables/snapshots/ibis.json +++ b/tests/_plugins/ui/_impl/tables/snapshots/ibis.json @@ -4,8 +4,8 @@ "bool": true, "int": 1, "float": 1.0, - "date": "2021-01-01T00:00:00.000", - "datetime": "2021-01-01T00:00:00.000", + "date": "2021-01-01", + "datetime": "2021-01-01 00:00:00", "time": "01:02:03", "nulls": null }, @@ -14,8 +14,8 @@ "bool": false, "int": 2, "float": 2.0, - "date": "2021-01-02T00:00:00.000", - "datetime": "2021-01-02T00:00:00.000", + "date": "2021-01-02", + "datetime": "2021-01-02 00:00:00", "time": "04:05:06", "nulls": "data" }, @@ -24,8 +24,8 @@ "bool": true, "int": 3, "float": 3.0, - "date": "2021-01-03T00:00:00.000", - "datetime": "2021-01-03T00:00:00.000", + "date": "2021-01-03", + "datetime": "2021-01-03 00:00:00", "time": "07:08:09", "nulls": null } diff --git a/tests/_plugins/ui/_impl/tables/test_ibis_table.py b/tests/_plugins/ui/_impl/tables/test_ibis_table.py index 03b5002cdc7..4f0e697522d 100644 --- a/tests/_plugins/ui/_impl/tables/test_ibis_table.py +++ b/tests/_plugins/ui/_impl/tables/test_ibis_table.py @@ -9,6 +9,7 @@ from marimo._data.models import BinValue, ColumnStats from marimo._dependencies.dependencies import DependencyManager +from marimo._plugins.ui._impl.tables.format import FormatMapping from marimo._plugins.ui._impl.tables.ibis_table import ( IbisTableManagerFactory, ) @@ -77,28 +78,28 @@ def test_to_csv(self) -> None: assert isinstance(self.manager.to_csv(), bytes) complex_data = self.get_complex_data() - data = complex_data.to_csv() - assert isinstance(data, bytes) - snapshot("ibis.csv", data.decode("utf-8")) + data = complex_data.to_csv_str() + assert isinstance(data, str) + snapshot("ibis.csv", data) def test_to_json(self) -> None: assert isinstance(self.manager.to_json(), bytes) complex_data = self.get_complex_data() - data = complex_data.to_json() - assert isinstance(data, bytes) - snapshot("ibis.json", data.decode("utf-8")) + data = complex_data.to_json_str() + assert isinstance(data, str) + snapshot("ibis.json", data) def test_to_json_format_mapping(self) -> None: import ibis - table = ibis.memtable({"int": [1, 2, 3]}) + table = ibis.memtable({"int": [1, 2, 3]}, schema={"int": "int64"}) data = self.factory.create()(table) - format_mapping = {"int": lambda x: x * 2} - json_data = data.to_json(format_mapping) + format_mapping: FormatMapping = {"int": lambda x: x * 2} + json_data = data.to_json_str(format_mapping) - json_object = json.loads(json_data.decode("utf-8")) + json_object = json.loads(json_data) assert json_object == [{"int": 2}, {"int": 4}, {"int": 6}] def test_complex_data_field_types(self) -> None: @@ -107,30 +108,30 @@ def test_complex_data_field_types(self) -> None: snapshot("ibis.field_types.json", json.dumps(field_types)) def test_select_rows(self) -> None: - import ibis - indices = [0, 2] selected_manager = self.manager.select_rows(indices) - expected_data = self.data.filter(ibis.row_number().isin(indices)) - assert selected_manager.data.to_pandas().equals( - expected_data.to_pandas() - ) + assert selected_manager.data.to_dict(as_series=False) == { + "A": [1, 3], + "B": ["a", "c"], + "C": [1.0, 3.0], + "D": [True, True], + "E": [ + datetime.datetime(2021, 1, 1), + datetime.datetime(2021, 1, 3), + ], + } def test_select_columns(self) -> None: columns = ["A"] selected_manager = self.manager.select_columns(columns) - expected_data = self.data.select(columns) - assert selected_manager.data.to_pandas().equals( - expected_data.to_pandas() - ) + assert selected_manager.data.collect().to_dict(as_series=False) == { + "A": [1, 2, 3], + } def test_drop_columns(self) -> None: columns = ["A"] dropped_manager = self.manager.drop_columns(columns) - expected_data = self.data.drop(columns) - assert dropped_manager.data.to_pandas().equals( - expected_data.to_pandas() - ) + assert dropped_manager.data.columns == ["B", "C", "D", "E"] def test_get_row_headers(self) -> None: expected_headers = [] @@ -152,35 +153,39 @@ def test_get_field_types(self) -> None: def test_limit(self) -> None: limited_manager = self.manager.take(1, 0) - expected_data = self.data.limit(1) - assert limited_manager.data.to_pandas().equals( - expected_data.to_pandas() - ) + assert limited_manager.get_num_rows() == 1 def test_take(self) -> None: - assert ( - self.manager.take(1, 0).select_columns(["A"]).to_json() - == b'[{"A":1}]' - ) - assert ( - self.manager.take(2, 0).select_columns(["A"]).to_json() - == b'[{"A":1},{"A":2}]' - ) - assert ( - self.manager.take(2, 1).select_columns(["A"]).to_json() - == b'[{"A":2},{"A":3}]' - ) - assert ( - self.manager.take(2, 2).select_columns(["A"]).to_json() - == b'[{"A":3}]' - ) + def as_list(df: Any) -> list[Any]: + return df.to_dict(as_series=False)["A"] + + assert as_list(self.manager.take(1, 0).data.collect()) == [1] + assert as_list(self.manager.take(2, 0).data.collect()) == [ + 1, + 2, + ] + + assert as_list(self.manager.take(2, 1).select_columns(["A"]).data) == [ + 2, + 3, + ] + + assert as_list(self.manager.take(2, 2).select_columns(["A"]).data) == [ + 3, + ] def test_to_parquet(self) -> None: assert isinstance(self.manager.to_parquet(), bytes) def test_take_zero(self) -> None: limited_manager = self.manager.take(0, 0) - assert limited_manager.data.count().execute() == 0 + assert limited_manager.data.collect().to_dict(as_series=False) == { + "A": [], + "B": [], + "C": [], + "D": [], + "E": [], + } def test_take_negative(self) -> None: with pytest.raises(ValueError): @@ -192,11 +197,11 @@ def test_take_negative_offset(self) -> None: def test_take_out_of_bounds(self) -> None: # Too large of page - assert self.manager.take(10, 0).data.count().execute() == 3 - assert self.data.count().execute() == 3 + assert self.manager.take(10, 0).get_num_rows() == 3 + assert self.manager.get_num_rows() == 3 # Too large of page and offset - assert self.manager.take(10, 10).data.count().execute() == 0 + assert self.manager.take(10, 10).get_num_rows() == 0 def test_stats_integer(self) -> None: column = "A" @@ -208,6 +213,7 @@ def test_stats_integer(self) -> None: max=3, mean=2.0, median=2.0, + unique=3, std=1.0, ) @@ -217,14 +223,22 @@ def test_stats_string(self) -> None: assert stats == ColumnStats( total=3, nulls=0, + unique=3, ) def test_sort_values(self) -> None: - import ibis - sorted_manager = self.manager.sort_values("A", descending=True) - expected_df = self.data.order_by(ibis.desc("A")) - assert sorted_manager.data.to_pandas().equals(expected_df.to_pandas()) + assert sorted_manager.data.collect().to_dict(as_series=False) == { + "A": [3, 2, 1], + "B": ["c", "b", "a"], + "C": [3.0, 2.0, 1.0], + "D": [True, False, True], + "E": [ + datetime.datetime(2021, 1, 3), + datetime.datetime(2021, 1, 2), + datetime.datetime(2021, 1, 1), + ], + } def test_get_unique_column_values(self) -> None: column = "A" @@ -251,14 +265,7 @@ def test_search(self) -> None: assert manager.search("true").get_num_rows() == 2 assert manager.search("food").get_num_rows() == 0 - @pytest.mark.xfail( - reason="column formatting is not supported in ibis", - ) def test_apply_formatting(self) -> None: - import ibis - - from marimo._plugins.ui._impl.tables.format import FormatMapping - format_mapping: FormatMapping = { "A": lambda x: x * 2, "B": lambda x: x.upper(), @@ -268,16 +275,13 @@ def test_apply_formatting(self) -> None: } formatted_data = self.manager.apply_formatting(format_mapping) - expected_data = ibis.memtable( - { - "A": [2, 4, 6], - "B": ["A", "B", "C"], - "C": ["1.00", "2.00", "3.00"], - "D": [False, True, False], - "E": ["2021-01-01", "2021-01-02", "2021-01-03"], - } - ) - assert formatted_data.to_pandas().equals(expected_data.to_pandas()) + assert formatted_data.data.to_dict(as_series=False) == { + "A": [2, 4, 6], + "B": ["A", "B", "C"], + "C": ["1.00", "2.00", "3.00"], + "D": [False, True, False], + "E": ["2021-01-01", "2021-01-02", "2021-01-03"], + } def test_empty_table(self) -> None: import ibis @@ -312,30 +316,33 @@ def test_search_with_regex(self) -> None: def test_sort_values_with_nulls(self) -> None: import ibis - import numpy as np table = ibis.memtable({"A": [3, 1, None, 2]}) manager = self.factory.create()(table) # Descending true sorted_manager = manager.sort_values("A", descending=True) - sorted_data = sorted_manager.data.to_pandas()["A"].tolist() - assert sorted_data[0:3] == [ + sorted_data = sorted_manager.data.collect().to_dict(as_series=False)[ + "A" + ] + assert sorted_data == [ 3.0, 2.0, 1.0, + None, ] - assert np.isnan(sorted_data[3]) # Descending false sorted_manager = manager.sort_values("A", descending=False) - sorted_data = sorted_manager.data.to_pandas()["A"].tolist() - assert sorted_data[0:3] == [ + sorted_data = sorted_manager.data.collect().to_dict(as_series=False)[ + "A" + ] + assert sorted_data == [ 1.0, 2.0, 3.0, + None, ] - assert np.isnan(sorted_data[3]) def test_calculate_top_k_rows(self) -> None: import ibis @@ -346,42 +353,37 @@ def test_calculate_top_k_rows(self) -> None: assert result == [(3, 2), (2, 1)] # Test equal counts with k limit - table = ibis.memtable({"A": [1, 1, 2, 2, 3]}) + table = ibis.memtable({"A": [1, 1, 2, 2, 2, 3]}) manager = self.factory.create()(table) result = manager.calculate_top_k_rows("A", 2) - assert len(result) == 2 - assert {(1, 2), (2, 2)} == set(result) - assert all(count == 2 for _, count in result) + assert result == [(2, 3), (1, 2)] def test_calculate_top_k_rows_nulls(self) -> None: import ibis - import pandas as pd # Test single null value table = ibis.memtable({"A": [3, None, None]}) manager = self.factory.create()(table) result = manager.calculate_top_k_rows("A", 10) - assert len(result) == 2 - assert result[1] == (3, 1) - assert pd.isna(result[0][0]) - assert result[0][1] == 2 + assert result == [ + (None, 2), + (3, 1), + ] # Test all null values table = ibis.memtable({"A": [None, None, None]}) manager = self.factory.create()(table) result = manager.calculate_top_k_rows("A", 10) - assert len(result) == 1 - assert pd.isna(result[0][0]) - assert result[0][1] == 3 + assert result == [(None, 3)] # Test mixed values with nulls - table = ibis.memtable({"A": [1, None, 2, None, 3, None]}) + table = ibis.memtable({"A": [1, None, 2, 2, None, 3, None]}) manager = self.factory.create()(table) result = manager.calculate_top_k_rows("A", 10) assert len(result) == 4 - assert pd.isna(result[0][0]) + assert result[0][0] is None assert result[0][1] == 3 - assert set(result[1:]) == {(1, 1), (2, 1), (3, 1)} + assert set(result[1:]) == {(1, 1), (2, 2), (3, 1)} def test_calculate_top_k_rows_nested_lists(self) -> None: import ibis @@ -418,8 +420,8 @@ def test_get_bin_values(self) -> None: assert result == [ BinValue(bin_start=-1.0, bin_end=4.2, count=2), BinValue(bin_start=4.2, bin_end=9.4, count=3), - BinValue(bin_start=9.4, bin_end=14.6, count=2), - BinValue(bin_start=14.6, bin_end=19.8, count=0), + BinValue(bin_start=9.4, bin_end=pytest.approx(14.6), count=2), + BinValue(bin_start=pytest.approx(14.6), bin_end=19.8, count=0), BinValue(bin_start=19.8, bin_end=25.0, count=2), ] @@ -576,3 +578,5 @@ def test_timedelta_column(self) -> None: count=2, ) ] + result = self.manager.calculate_top_k_rows("A", 2) + assert result == [(None, 3), (2, 2)] diff --git a/tests/_plugins/ui/_impl/tables/test_narwhals.py b/tests/_plugins/ui/_impl/tables/test_narwhals.py index 4e11f6e7e79..6b9c91e32ae 100644 --- a/tests/_plugins/ui/_impl/tables/test_narwhals.py +++ b/tests/_plugins/ui/_impl/tables/test_narwhals.py @@ -41,8 +41,7 @@ SUPPORTED_LIBS: list[DFType] = [ "pandas", "polars", - # TODO: Either we can import narwhals `main` or wait for v0.1.0 - # "ibis", + "ibis", "lazy-polars", "pyarrow", ] @@ -362,6 +361,7 @@ def test_summary_number(self) -> None: mean=2.0, median=2.0, std=1.0, + unique=3, p5=1.0, p25=2.0, p75=3.0, @@ -459,6 +459,7 @@ def test_get_stats_unwraps_scalars_properly(self) -> None: max=5.5, mean=3.5, median=3.5, + unique=5, std=1.5811388300841898, p5=1.5, p25=2.5, @@ -867,7 +868,6 @@ def test_to_csv(df: Any) -> None: "B": ["a", "b", "c"], "C": [1.0, 2.0, 3.0], }, - exclude=["ibis", "duckdb"], ), ) def test_to_parquet(df: Any) -> None: @@ -922,7 +922,7 @@ def test_empty_dataframe(df: Any) -> None: @pytest.mark.parametrize( "df", create_dataframes( - {"A": [1, 2, 3], "B": [None, None, None]}, exclude=["ibis", "duckdb"] + {"A": [1, 2, 3], "B": [None, None, None]}, exclude=["duckdb"] ), ) def test_dataframe_with_all_null_column(df: Any) -> None: @@ -1002,7 +1002,6 @@ def test_get_summary_all_types() -> None: "string": ["a", "b", "b", "c", "d", "d", "e"], "boolean": [True, False, False, True, False, False, True], }, - exclude=["ibis", "duckdb"], strict=False, ), ) @@ -1093,7 +1092,7 @@ def _round_bin_values(bin_values: list[BinValue]) -> list[BinValue]: datetime.date(2021, 1, 1), ], }, - exclude=["ibis", "duckdb"], + exclude=["ibis"], ), ) class TestGetBinValuesTemporal: @@ -1216,9 +1215,7 @@ def test_dates_multiple(self, df: Any) -> None: @pytest.mark.skipif(not HAS_DEPS, reason="optional dependencies not installed") @pytest.mark.parametrize( "df", - create_dataframes( - {"A": ["apple", "banana", "cherry"]}, exclude=["ibis", "duckdb"] - ), + create_dataframes({"A": ["apple", "banana", "cherry"]}), ) def test_search_with_regex(df: Any) -> None: manager = NarwhalsTableManager.from_dataframe(df) @@ -1229,7 +1226,7 @@ def test_search_with_regex(df: Any) -> None: @pytest.mark.skipif(not HAS_DEPS, reason="optional dependencies not installed") @pytest.mark.parametrize( "df", - create_dataframes({"A": [3, 1, None, 2]}, exclude=["ibis", "duckdb"]), + create_dataframes({"A": [3, 1, None, 2]}), ) def test_sort_values_with_nulls(df: Any) -> None: manager = NarwhalsTableManager.from_dataframe(df) @@ -1371,19 +1368,6 @@ def test_calculate_top_k_rows(df: Any) -> None: assert normalized_result == [(3, 3), (None, 2)] -@pytest.mark.skipif( - not DependencyManager.ibis.has(), - reason="Ibis not installed", -) -def test_calculate_top_k_rows_metadata_only_frame() -> None: - import ibis - - df = ibis.memtable({"A": [1, 2, 3, 3, None, None]}) - manager = NarwhalsTableManager.from_dataframe(df) - result = manager.calculate_top_k_rows("A", 10) - assert result == [(None, 2), (3, 2), (1, 1), (2, 1)] - - @pytest.mark.skipif(not HAS_DEPS, reason="optional dependencies not installed") @pytest.mark.parametrize( "df", @@ -1586,10 +1570,7 @@ def test_calculate_top_k_rows_caching(df: Any) -> None: @pytest.mark.skipif(not HAS_DEPS, reason="optional dependencies not installed") @pytest.mark.parametrize( "df", - create_dataframes( - {"name": ["Alice", "Eve", None], "age": [25, 35, None]}, - exclude=["ibis", "duckdb"], - ), + create_dataframes({"name": ["Alice", "Eve", None], "age": [25, 35, None]}), ) def test_calculate_top_k_rows_cache_invalidation(df: Any) -> None: """Test that cache is properly invalidated when data changes.""" diff --git a/tests/_plugins/ui/_impl/tables/test_pandas_table.py b/tests/_plugins/ui/_impl/tables/test_pandas_table.py index bc8bc497a61..1d31bc80032 100644 --- a/tests/_plugins/ui/_impl/tables/test_pandas_table.py +++ b/tests/_plugins/ui/_impl/tables/test_pandas_table.py @@ -611,7 +611,7 @@ def test_summary_number(self) -> None: assert summary == ColumnStats( total=3, nulls=0, - unique=None, + unique=3, min=1.0, max=3.0, mean=2.0, diff --git a/tests/_plugins/ui/_impl/tables/test_polars_table.py b/tests/_plugins/ui/_impl/tables/test_polars_table.py index c581c59c0f5..7513c35fb36 100644 --- a/tests/_plugins/ui/_impl/tables/test_polars_table.py +++ b/tests/_plugins/ui/_impl/tables/test_polars_table.py @@ -400,6 +400,7 @@ def test_stats_number(self) -> None: mean=2.0, median=2.0, std=1.0, + unique=3, p5=1.0, p25=2.0, p75=3.0, diff --git a/tests/_plugins/ui/_impl/test_table.py b/tests/_plugins/ui/_impl/test_table.py index da4461d8183..483b406ebda 100644 --- a/tests/_plugins/ui/_impl/test_table.py +++ b/tests/_plugins/ui/_impl/test_table.py @@ -579,8 +579,7 @@ def test_value_with_cell_selection_then_sorting_dict_of_lists() -> None: "df", create_dataframes({"a": [1, 2, 3]}, include=["ibis"]) ) def test_value_with_cell_selection_unsupported_for_ibis(df: Any) -> None: - with pytest.raises(NotImplementedError): - _table = ui.table(df, selection="multi-cell") + _table = ui.table(df, selection="multi-cell") def test_search_sort_nonexistent_columns() -> None: