Skip to content

Commit 399edd8

Browse files
authored
support iceberg introspection through duckdb (#6707)
## 📝 Summary <!-- Provide a concise summary of what this pull request is addressing. If this PR fixes any issues, list them here by number (e.g., Fixes #123). --> Fixes #6688. Also fixes a bug with geometry types. Looks like an upstream bug, I will try to see if we can find metadata to indicate this is a catalog table. ## 🔍 Description of Changes <!-- Detail the specific changes made in this pull request. Explain the problem addressed and how it was resolved. If applicable, provide before and after comparisons, screenshots, or any relevant details to help reviewers understand the changes easily. --> ## 📋 Checklist - [x] I have read the [contributor guidelines](https://github.com/marimo-team/marimo/blob/main/CONTRIBUTING.md). - [ ] For large changes, or changes that affect the public API: this change was discussed or approved through an issue, on [Discord](https://marimo.io/discord?ref=pr), or the community [discussions](https://github.com/marimo-team/marimo/discussions) (Please provide a link if applicable). - [x] I have added tests for the changes made. - [x] I have run the code and verified that it works as expected.
1 parent 8ae5ac7 commit 399edd8

File tree

2 files changed

+92
-25
lines changed

2 files changed

+92
-25
lines changed

marimo/_data/get_datasets.py

Lines changed: 81 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Copyright 2024 Marimo. All rights reserved.
22
from __future__ import annotations
33

4-
from typing import TYPE_CHECKING, Optional, cast
4+
from typing import TYPE_CHECKING, Any, Optional, cast
55

66
from marimo import _loggers
77
from marimo._data.models import (
@@ -86,6 +86,22 @@ def has_updates_to_datasource(query: str) -> bool:
8686
)
8787

8888

89+
def execute_duckdb_query(
90+
connection: Optional[duckdb.DuckDBPyConnection], query: str
91+
) -> list[Any]:
92+
"""Execute a DuckDB query and return the result. Uses connection if provided, otherwise uses duckdb."""
93+
try:
94+
if connection is None:
95+
import duckdb
96+
97+
return duckdb.execute(query).fetchall()
98+
99+
return connection.execute(query).fetchall()
100+
except Exception:
101+
LOGGER.exception("Failed to execute DuckDB query")
102+
return []
103+
104+
89105
def get_databases_from_duckdb(
90106
connection: Optional[duckdb.DuckDBPyConnection],
91107
engine_name: Optional[VariableName] = None,
@@ -109,12 +125,7 @@ def _get_databases_from_duckdb_internal(
109125
# 3:"column_names"
110126
# 4:"column_types"
111127
# 5:"temporary"
112-
if connection is None:
113-
import duckdb
114-
115-
tables_result = duckdb.execute("SHOW ALL TABLES").fetchall()
116-
else:
117-
tables_result = connection.execute("SHOW ALL TABLES").fetchall()
128+
tables_result = execute_duckdb_query(connection, "SHOW ALL TABLES")
118129

119130
if len(tables_result) == 0:
120131
# Return empty databases if there are no tables
@@ -135,6 +146,10 @@ def _get_databases_from_duckdb_internal(
135146

136147
SKIP_TABLES = ["duckdb_functions()", "duckdb_types()", "duckdb_settings()"]
137148

149+
# Bug with Iceberg catalog tables where there is a single column named "__"
150+
# https://github.com/marimo-team/marimo/issues/6688
151+
CATALOG_TABLE_COLUMN_NAME = "__"
152+
138153
for (
139154
database,
140155
schema,
@@ -150,18 +165,26 @@ def _get_databases_from_duckdb_internal(
150165
assert isinstance(column_names, list)
151166
assert isinstance(column_types, list)
152167

153-
columns = [
154-
DataTableColumn(
155-
name=column_name,
156-
type=_db_type_to_data_type(column_type),
157-
external_type=column_type,
158-
sample_values=[],
159-
)
160-
for column_name, column_type in zip(
161-
cast(list[str], column_names),
162-
cast(list[str], column_types),
163-
)
164-
]
168+
catalog_table = (
169+
len(column_names) == 1
170+
and column_names[0] == CATALOG_TABLE_COLUMN_NAME
171+
)
172+
if catalog_table:
173+
qualified_name = f"{database}.{schema}.{name}"
174+
columns = get_table_columns(connection, qualified_name)
175+
else:
176+
columns = [
177+
DataTableColumn(
178+
name=column_name,
179+
type=_db_type_to_data_type(column_type),
180+
external_type=column_type,
181+
sample_values=[],
182+
)
183+
for column_name, column_type in zip(
184+
cast(list[str], column_names),
185+
cast(list[str], column_types),
186+
)
187+
]
165188

166189
table = DataTable(
167190
source_type="duckdb" if engine_name is None else "connection",
@@ -211,6 +234,41 @@ def _get_databases_from_duckdb_internal(
211234
return databases
212235

213236

237+
def get_table_columns(
238+
connection: Optional[duckdb.DuckDBPyConnection], table_name: str
239+
) -> list[DataTableColumn]:
240+
"""Dedicated query to get columns from a table."""
241+
query = f"DESCRIBE TABLE {table_name}"
242+
243+
try:
244+
columns_result = execute_duckdb_query(connection, query)
245+
if len(columns_result) == 0:
246+
return []
247+
248+
columns: list[DataTableColumn] = []
249+
250+
for (
251+
column_name,
252+
column_type,
253+
_null,
254+
_key,
255+
_default,
256+
_extra,
257+
) in columns_result:
258+
column = DataTableColumn(
259+
name=column_name,
260+
type=_db_type_to_data_type(column_type),
261+
external_type=column_type,
262+
sample_values=[],
263+
)
264+
columns.append(column)
265+
return columns
266+
267+
except Exception:
268+
LOGGER.debug("Failed to get columns from DuckDB")
269+
return []
270+
271+
214272
def _get_duckdb_database_names(
215273
connection: Optional[duckdb.DuckDBPyConnection],
216274
) -> list[str]:
@@ -227,12 +285,7 @@ def _get_duckdb_database_names(
227285
database_query = "SELECT * FROM duckdb_databases()"
228286

229287
try:
230-
if connection is None:
231-
import duckdb
232-
233-
databases_result = duckdb.execute(database_query).fetchall()
234-
else:
235-
databases_result = connection.execute(database_query).fetchall()
288+
databases_result = execute_duckdb_query(connection, database_query)
236289
if not len(databases_result):
237290
return []
238291

@@ -326,6 +379,9 @@ def _db_type_to_data_type(db_type: str) -> DataType:
326379
return "string" # Representing bit as string
327380
if db_type == "enum" or db_type.startswith("enum"):
328381
return "string" # Representing enum as string
382+
# Geometry types
383+
if db_type == "geometry":
384+
return "unknown"
329385

330386
LOGGER.warning("Unknown DuckDB type: %s", db_type)
331387
# Unknown type

tests/_data/test_get_datasets.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from marimo._data.get_datasets import (
99
get_databases_from_duckdb,
1010
get_datasets_from_variables,
11+
get_table_columns,
1112
has_updates_to_datasource,
1213
)
1314
from marimo._data.models import Database, DataTable, DataTableColumn, Schema
@@ -514,3 +515,13 @@ def test_get_datasets_from_variables(df: Any) -> None:
514515
],
515516
)
516517
]
518+
519+
520+
def test_get_table_columns() -> None:
521+
import duckdb
522+
523+
connection = duckdb.connect(":memory:")
524+
connection.execute(sql_query)
525+
526+
columns = get_table_columns(connection, "all_types")
527+
assert columns == all_types_tables[0].columns

0 commit comments

Comments
 (0)