From 76d88f43b66caf0c9edfb1b2806e8836163f514b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 20 Jul 2021 14:39:29 -0500 Subject: [PATCH 01/12] feat!: use nullable types like float and Int64 by default in `to_dataframe` To override this behavior, specify the types for the desired columns with the `dtype` argument. --- tests/unit/test_table_pandas.py | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tests/unit/test_table_pandas.py diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py new file mode 100644 index 000000000..a2da48343 --- /dev/null +++ b/tests/unit/test_table_pandas.py @@ -0,0 +1,69 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pytest + +from google.cloud import bigquery + +pandas = pytest.importorskip("pandas") + + +TEST_PATH = "/v1/project/test-proj/dataset/test-dset/table/test-tbl/data" + + +@pytest.fixture +def class_under_test(): + from google.cloud.bigquery.table import RowIterator + + return RowIterator + + +def test_to_dataframe_defaults_to_nullable_dtypes(class_under_test): + nullable_schema = [ + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("datetime_col", "DATETIME"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("float64_col", "FLOAT64"), + bigquery.SchemaField("integer_col", "INTEGER"), + bigquery.SchemaField("int64_col", "INT64"), + bigquery.SchemaField( + "time_col", "TIME" + ), # TODO: use timedelta64 dtype for this? + bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + ] + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema,) + rows.to_dataframe() # TODO: if we are always using BQ Storage API for + # to_dataframe, maybe wait to implement until after required? 
+ # TODO: behavior is based on schema (and data rows) + assert False + + +def test_to_dataframe_bqstorage_defaults_to_nullable_dtypes(class_under_test): + # TODO: behavior is based on schema (and data rows) + assert False + + +def test_to_dataframe_overrides_nullable_dtypes(class_under_test): + """Passing in explicit dtypes is merged with default behavior.""" + assert False + + +def test_to_dataframe_bqstorage_overrides_nullable_dtypes(class_under_test): + """Passing in explicit dtypes is merged with default behavior.""" + assert False From f2223e97d2c83c64ae1a49de40dc5af9300e3e91 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 21 Jul 2021 15:53:27 -0500 Subject: [PATCH 02/12] add test data for all scalar columns --- tests/data/scalars.jsonl | 2 + tests/data/scalars_extreme.jsonl | 4 ++ tests/data/scalars_schema.json | 62 ++++++++++++++++++++++++++++ tests/system/conftest.py | 69 ++++++++++++++++++++++++++++++-- tests/system/test_arrow.py | 56 ++++++++++++++++++++++++++ 5 files changed, 189 insertions(+), 4 deletions(-) create mode 100644 tests/data/scalars.jsonl create mode 100644 tests/data/scalars_extreme.jsonl create mode 100644 tests/data/scalars_schema.json create mode 100644 tests/system/test_arrow.py diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl new file mode 100644 index 000000000..4419a6e9a --- /dev/null +++ b/tests/data/scalars.jsonl @@ -0,0 +1,2 @@ +{"bool_col": true, "bytes_col": "abcd", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "numeric_col": "1.23456789", "bignumeric_col": "10.111213141516171819", "float64_col": "1.25", "string_col": "Hello, World", "time_col": "11:41:43.07616", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_extreme.jsonl b/tests/data/scalars_extreme.jsonl new file mode 100644 index 000000000..c2a923366 --- /dev/null +++ b/tests/data/scalars_extreme.jsonl @@ -0,0 +1,4 @@ +{"bool_col": true, "bytes_col": "abcd", "date_col": "9999-12-31", "datetime_col": "9999-12-31 23:59:59.999999", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "9223372036854775807", "numeric_col": "9.9999999999999999999999999999999999999E+28", "bignumeric_col": "5.7896044618658097711785492504343953926634992332820282019728792003956564819967E+38", "float64_col": "+inf", "string_col": "Hello, World", "time_col": "23:59:59.99999", "timestamp_col": "9999-12-31T23:59:59.999999Z"} +{"bool_col": false, "bytes_col": "abcd", "date_col": "0001-01-01", "datetime_col": "0001-01-01 00:00:00", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "-9223372036854775808", "numeric_col": "-9.9999999999999999999999999999999999999E+28", "bignumeric_col": "-5.7896044618658097711785492504343953926634992332820282019728792003956564819968E+38", "float64_col": "-inf", "string_col": "Hello, World", "time_col": "00:00:00", "timestamp_col": "0001-01-01T00:00:00.000000Z"} +{"bool_col": false, "bytes_col": "", "date_col": "1970-01-01", "datetime_col": "1970-01-01 00:00:00", "geography_col": "POINT(0 0)", "int64_col": "0", "numeric_col": "0.0", "bignumeric_col": "0.0", "float64_col": 0.0, "string_col": "", "time_col": "12:00:00", "timestamp_col": "1970-01-01T00:00:00.000000Z"} +{"bool_col": null, 
"bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_schema.json b/tests/data/scalars_schema.json new file mode 100644 index 000000000..00bd150fd --- /dev/null +++ b/tests/data/scalars_schema.json @@ -0,0 +1,62 @@ +[ + { + "mode": "NULLABLE", + "name": "timestamp_col", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "time_col", + "type": "TIME" + }, + { + "mode": "NULLABLE", + "name": "float64_col", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "datetime_col", + "type": "DATETIME" + }, + { + "mode": "NULLABLE", + "name": "bignumeric_col", + "type": "BIGNUMERIC" + }, + { + "mode": "NULLABLE", + "name": "numeric_col", + "type": "NUMERIC" + }, + { + "mode": "NULLABLE", + "name": "geography_col", + "type": "GEOGRAPHY" + }, + { + "mode": "NULLABLE", + "name": "date_col", + "type": "DATE" + }, + { + "mode": "NULLABLE", + "name": "string_col", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "bool_col", + "type": "BOOLEAN" + }, + { + "mode": "NULLABLE", + "name": "bytes_col", + "type": "BYTES" + }, + { + "mode": "NULLABLE", + "name": "int64_col", + "type": "INTEGER" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4eef60e92..e7d8200bb 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -12,11 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pathlib + import pytest +import test_utils.prefixer +from google.cloud import bigquery +from google.cloud.bigquery import enums from . import helpers +prefixer = test_utils.prefixer.Prefixer("python-bigquery", "tests/system") + + +DATA_DIR = pathlib.Path(__file__).parent.parent / "data" + + +@pytest.fixture(scope="session", autouse=True) +def cleanup_datasets(bigquery_client: bigquery.Client): + for dataset in bigquery_client.list_datasets(): + if prefixer.should_cleanup(dataset.dataset_id): + bigquery_client.delete_dataset( + dataset, delete_contents=True, not_found_ok=True + ) + + @pytest.fixture(scope="session") def bigquery_client(): from google.cloud import bigquery @@ -24,6 +44,11 @@ def bigquery_client(): return bigquery.Client() +@pytest.fixture(scope="session") +def project_id(bigquery_client: bigquery.Client): + return bigquery_client.project + + @pytest.fixture(scope="session") def bqstorage_client(bigquery_client): from google.cloud import bigquery_storage @@ -32,13 +57,49 @@ def bqstorage_client(bigquery_client): @pytest.fixture(scope="session") -def dataset_id(bigquery_client): - dataset_id = f"bqsystem_{helpers.temp_suffix()}" - bigquery_client.create_dataset(dataset_id) +def dataset_id(bigquery_client: bigquery.Client, project_id: str): + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + bigquery_client.create_dataset(dataset) yield dataset_id - bigquery_client.delete_dataset(dataset_id, delete_contents=True) + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) @pytest.fixture def table_id(dataset_id): return f"{dataset_id}.table_{helpers.temp_suffix()}" + + +@pytest.fixture(scope="session") +def scalars_table(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = 
bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars" + with open(DATA_DIR / "scalars.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) + + +@pytest.fixture(scope="session") +def scalars_extreme_table( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars_extreme" + with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py new file mode 100644 index 000000000..5c0104b59 --- /dev/null +++ b/tests/system/test_arrow.py @@ -0,0 +1,56 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""System tests for Arrow connector.""" + + +def test_bqstorage(bigquery_client, scalars_table): + arrow_table = bigquery_client.list_rows(scalars_table).to_arrow() + assert arrow_table.schema is None + + # timestamp_col: timestamp[us, tz=UTC] + # time_col: time64[us] + # float64_col: double + # datetime_col: timestamp[us] + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:datetime' + # bignumeric_col: decimal256(76, 38) + # numeric_col: decimal128(38, 9) + # geography_col: string + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:geography' + # ARROW:extension:metadata: '{"encoding": "WKT"}' + # date_col: date32[day] + # string_col: string + # bool_col: bool + # bytes_col: binary + # int64_col: int64 + + +def test_rest(bigquery_client, scalars_table): + arrow_table = bigquery_client.list_rows(scalars_table, max_results=10).to_arrow() + assert arrow_table.schema is None + + # timestamp_col: timestamp[us, tz=UTC] + # time_col: time64[us] + # float64_col: double + # datetime_col: timestamp[us] + # bignumeric_col: decimal256(76, 38) + # numeric_col: decimal128(38, 9) + # geography_col: string + # date_col: date32[day] + # string_col: string + # bool_col: bool + # bytes_col: binary + # int64_col: int64 From 07ed8717370db4659f23c561d6e7637cc37f98c5 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 21 Jul 2021 15:53:27 -0500 Subject: [PATCH 03/12] add test data for all scalar columns --- tests/data/scalars.jsonl | 2 + tests/data/scalars_extreme.jsonl | 4 ++ tests/data/scalars_schema.json | 62 ++++++++++++++++++++++++++ tests/system/conftest.py | 69 +++++++++++++++++++++++++++-- tests/system/test_arrow.py | 56 +++++++++++++++++++++++ tests/system/test_pandas.py | 76 ++++++++++++++++++++++++++++++++ 6 files changed, 265 insertions(+), 4 deletions(-) create mode 100644 tests/data/scalars.jsonl create mode 100644 tests/data/scalars_extreme.jsonl create mode 100644 tests/data/scalars_schema.json create mode 100644 tests/system/test_arrow.py diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl new file mode 100644 index 000000000..4419a6e9a --- /dev/null +++ b/tests/data/scalars.jsonl @@ -0,0 +1,2 @@ +{"bool_col": true, "bytes_col": "abcd", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "numeric_col": "1.23456789", "bignumeric_col": "10.111213141516171819", "float64_col": "1.25", "string_col": "Hello, World", "time_col": "11:41:43.07616", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_extreme.jsonl b/tests/data/scalars_extreme.jsonl new file mode 100644 index 000000000..c2a923366 --- /dev/null +++ b/tests/data/scalars_extreme.jsonl @@ -0,0 +1,4 @@ +{"bool_col": true, "bytes_col": "abcd", "date_col": "9999-12-31", "datetime_col": "9999-12-31 23:59:59.999999", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "9223372036854775807", "numeric_col": "9.9999999999999999999999999999999999999E+28", "bignumeric_col": "5.7896044618658097711785492504343953926634992332820282019728792003956564819967E+38", "float64_col": "+inf", "string_col": "Hello, World", "time_col": "23:59:59.99999", "timestamp_col": "9999-12-31T23:59:59.999999Z"} +{"bool_col": false, "bytes_col": "abcd", 
"date_col": "0001-01-01", "datetime_col": "0001-01-01 00:00:00", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "-9223372036854775808", "numeric_col": "-9.9999999999999999999999999999999999999E+28", "bignumeric_col": "-5.7896044618658097711785492504343953926634992332820282019728792003956564819968E+38", "float64_col": "-inf", "string_col": "Hello, World", "time_col": "00:00:00", "timestamp_col": "0001-01-01T00:00:00.000000Z"} +{"bool_col": false, "bytes_col": "", "date_col": "1970-01-01", "datetime_col": "1970-01-01 00:00:00", "geography_col": "POINT(0 0)", "int64_col": "0", "numeric_col": "0.0", "bignumeric_col": "0.0", "float64_col": 0.0, "string_col": "", "time_col": "12:00:00", "timestamp_col": "1970-01-01T00:00:00.000000Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_schema.json b/tests/data/scalars_schema.json new file mode 100644 index 000000000..00bd150fd --- /dev/null +++ b/tests/data/scalars_schema.json @@ -0,0 +1,62 @@ +[ + { + "mode": "NULLABLE", + "name": "timestamp_col", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "time_col", + "type": "TIME" + }, + { + "mode": "NULLABLE", + "name": "float64_col", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "datetime_col", + "type": "DATETIME" + }, + { + "mode": "NULLABLE", + "name": "bignumeric_col", + "type": "BIGNUMERIC" + }, + { + "mode": "NULLABLE", + "name": "numeric_col", + "type": "NUMERIC" + }, + { + "mode": "NULLABLE", + "name": "geography_col", + "type": "GEOGRAPHY" + }, + { + "mode": "NULLABLE", + "name": "date_col", + "type": "DATE" + }, + { + "mode": "NULLABLE", + "name": "string_col", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "bool_col", + "type": "BOOLEAN" + }, + { + "mode": "NULLABLE", + "name": "bytes_col", + "type": "BYTES" + }, + { + "mode": "NULLABLE", + "name": "int64_col", + "type": "INTEGER" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4eef60e92..e7d8200bb 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -12,11 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pathlib + import pytest +import test_utils.prefixer +from google.cloud import bigquery +from google.cloud.bigquery import enums from . 
import helpers +prefixer = test_utils.prefixer.Prefixer("python-bigquery", "tests/system") + + +DATA_DIR = pathlib.Path(__file__).parent.parent / "data" + + +@pytest.fixture(scope="session", autouse=True) +def cleanup_datasets(bigquery_client: bigquery.Client): + for dataset in bigquery_client.list_datasets(): + if prefixer.should_cleanup(dataset.dataset_id): + bigquery_client.delete_dataset( + dataset, delete_contents=True, not_found_ok=True + ) + + @pytest.fixture(scope="session") def bigquery_client(): from google.cloud import bigquery @@ -24,6 +44,11 @@ def bigquery_client(): return bigquery.Client() +@pytest.fixture(scope="session") +def project_id(bigquery_client: bigquery.Client): + return bigquery_client.project + + @pytest.fixture(scope="session") def bqstorage_client(bigquery_client): from google.cloud import bigquery_storage @@ -32,13 +57,49 @@ def bqstorage_client(bigquery_client): @pytest.fixture(scope="session") -def dataset_id(bigquery_client): - dataset_id = f"bqsystem_{helpers.temp_suffix()}" - bigquery_client.create_dataset(dataset_id) +def dataset_id(bigquery_client: bigquery.Client, project_id: str): + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + bigquery_client.create_dataset(dataset) yield dataset_id - bigquery_client.delete_dataset(dataset_id, delete_contents=True) + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) @pytest.fixture def table_id(dataset_id): return f"{dataset_id}.table_{helpers.temp_suffix()}" + + +@pytest.fixture(scope="session") +def scalars_table(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars" + with open(DATA_DIR / "scalars.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) + + +@pytest.fixture(scope="session") +def scalars_extreme_table( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars_extreme" + with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py new file mode 100644 index 000000000..5c0104b59 --- /dev/null +++ b/tests/system/test_arrow.py @@ -0,0 +1,56 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for Arrow connector.""" + + +def test_bqstorage(bigquery_client, scalars_table): + arrow_table = bigquery_client.list_rows(scalars_table).to_arrow() + assert arrow_table.schema is None + + # timestamp_col: timestamp[us, tz=UTC] + # time_col: time64[us] + # float64_col: double + # datetime_col: timestamp[us] + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:datetime' + # bignumeric_col: decimal256(76, 38) + # numeric_col: decimal128(38, 9) + # geography_col: string + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:geography' + # ARROW:extension:metadata: '{"encoding": "WKT"}' + # date_col: date32[day] + # string_col: string + # bool_col: bool + # bytes_col: binary + # int64_col: int64 + + +def test_rest(bigquery_client, scalars_table): + arrow_table = bigquery_client.list_rows(scalars_table, max_results=10).to_arrow() + assert arrow_table.schema is None + + # timestamp_col: timestamp[us, tz=UTC] + # time_col: time64[us] + # float64_col: double + # datetime_col: timestamp[us] + # bignumeric_col: decimal256(76, 38) + # numeric_col: decimal128(38, 9) + # geography_col: string + # date_col: date32[day] + # string_col: string + # bool_col: bool + # bytes_col: binary + # int64_col: int64 diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index ddf5eaf43..3a0896e80 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -792,3 +792,79 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client): dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) assert len(dataframe.index) == 100 + + +def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table): + df = bigquery_client.list_rows(scalars_table).to_dataframe() + + # timestamp_col: timestamp[us, tz=UTC] + # time_col: time64[us] + # float64_col: double + # datetime_col: timestamp[us] + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:datetime' + # bignumeric_col: decimal256(76, 38) + # numeric_col: decimal128(38, 9) + # geography_col: string + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:geography' + # ARROW:extension:metadata: '{"encoding": "WKT"}' + # date_col: date32[day] + # string_col: string + # bool_col: bool + # bytes_col: binary + # int64_col: int64 + + assert df.dtypes is None + + # timestamp_col datetime64[ns, UTC] + # time_col object <-- use Period? 
+ # float64_col float64 + # datetime_col datetime64[ns] + # bignumeric_col object <-- probably correct + # numeric_col object <-- probably correct + # geography_col object <-- https://github.com/googleapis/python-bigquery/issues/792 + # date_col object <-- per https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#overview, should be datetime64[ns] (where possible) + # string_col object + # bool_col object <-- maybe should be "boolean" (added in pandas 1.0.0) + # bytes_col object + # int64_col float64 <-- https://github.com/googleapis/python-bigquery/issues/793 + + +def test_list_rows_nullable_scalars_extreme_dtypes( + bigquery_client, scalars_extreme_table +): + df = bigquery_client.list_rows(scalars_extreme_table).to_dataframe() + + # timestamp_col: timestamp[us, tz=UTC] + # time_col: time64[us] + # float64_col: double + # datetime_col: timestamp[us] + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:datetime' + # bignumeric_col: decimal256(76, 38) + # numeric_col: decimal128(38, 9) + # geography_col: string + # -- field metadata -- + # ARROW:extension:name: 'google:sqlType:geography' + # ARROW:extension:metadata: '{"encoding": "WKT"}' + # date_col: date32[day] + # string_col: string + # bool_col: bool + # bytes_col: binary + # int64_col: int64 + + assert df.dtypes is None + + # timestamp_col object + # time_col object + # float64_col float64 + # datetime_col object <-- correct, since extreme values are out-of-bounds + # bignumeric_col object + # numeric_col object + # geography_col object + # date_col object + # string_col object + # bool_col object + # bytes_col object + # int64_col float64 From 21d43698aa44db23ae3d3b4e9c93cc1cec877b17 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 22 Jul 2021 17:07:42 -0500 Subject: [PATCH 04/12] update tests with expected dtypes --- tests/system/test_pandas.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 3a0896e80..da6241171 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -795,7 +795,9 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client): def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table): - df = bigquery_client.list_rows(scalars_table).to_dataframe() + df = bigquery_client.list_rows( + scalars_table + ).to_dataframe() # dtypes={"int64_col": "Int64"}) # timestamp_col: timestamp[us, tz=UTC] # time_col: time64[us] @@ -815,7 +817,12 @@ def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table): # bytes_col: binary # int64_col: int64 - assert df.dtypes is None + assert df.dtypes["datetime_col"].name == "datetime64[ns]" + assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" + assert df.dtypes["float64_col"].name == "float64" + assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["date_col"].name == "datetime64[ns]" + assert df.dtypes["int64_col"].name == "Int64" # timestamp_col datetime64[ns, UTC] # time_col object <-- use Period? @@ -854,8 +861,6 @@ def test_list_rows_nullable_scalars_extreme_dtypes( # bytes_col: binary # int64_col: int64 - assert df.dtypes is None - # timestamp_col object # time_col object # float64_col float64 @@ -868,3 +873,16 @@ def test_list_rows_nullable_scalars_extreme_dtypes( # bool_col object # bytes_col object # int64_col float64 + + # Extreme values are out-of-bounds for pandas datetime64 values, which use + # nanosecond precision. 
Values before 1677-09-21 and after 2262-04-11 must + # be represented with object. + # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations + assert df.dtypes["date_col"].name == "object" + assert df.dtypes["datetime_col"].name == "object" + assert df.dtypes["timestamp_col"].name == "object" + + # These pandas dtypes can handle the same ranges as BigQuery. + assert df.dtypes["float64_col"].name == "float64" + assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["int64_col"].name == "Int64" From 69a747f2fa45c12029aa54eea79dd5e95299107d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 23 Jul 2021 15:19:26 -0500 Subject: [PATCH 05/12] add expected types, REST test --- tests/system/test_pandas.py | 110 +++++++++++++----------------------- 1 file changed, 40 insertions(+), 70 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index da6241171..06bf03c6b 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -794,85 +794,46 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client): assert len(dataframe.index) == 100 -def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table): +@pytest.mark.parametrize( + ("max_results",), ((None,), (10,),) # Use BQ Storage API. # Use REST API. +) +def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results): df = bigquery_client.list_rows( - scalars_table - ).to_dataframe() # dtypes={"int64_col": "Int64"}) - - # timestamp_col: timestamp[us, tz=UTC] - # time_col: time64[us] - # float64_col: double - # datetime_col: timestamp[us] - # -- field metadata -- - # ARROW:extension:name: 'google:sqlType:datetime' - # bignumeric_col: decimal256(76, 38) - # numeric_col: decimal128(38, 9) - # geography_col: string - # -- field metadata -- - # ARROW:extension:name: 'google:sqlType:geography' - # ARROW:extension:metadata: '{"encoding": "WKT"}' - # date_col: date32[day] - # string_col: string - # bool_col: bool - # bytes_col: binary - # int64_col: int64 + scalars_table, max_results=max_results, + ).to_dataframe( + dtypes={ + "bool_col": "boolean", + "date_col": "datetime64[ns]", + "int64_col": "Int64", + } + ) - assert df.dtypes["datetime_col"].name == "datetime64[ns]" - assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" - assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["bool_col"].name == "boolean" assert df.dtypes["date_col"].name == "datetime64[ns]" + assert df.dtypes["datetime_col"].name == "datetime64[ns]" + assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["int64_col"].name == "Int64" + assert df.dtypes["time_col"].name == "timedelta64[ns]" + assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" + + # decimal.Decimal is used to avoid loss of precision. + assert df.dtypes["bignumeric_col"].name == "object" + assert df.dtypes["numeric_col"].name == "object" - # timestamp_col datetime64[ns, UTC] - # time_col object <-- use Period? 
- # float64_col float64 - # datetime_col datetime64[ns] - # bignumeric_col object <-- probably correct - # numeric_col object <-- probably correct - # geography_col object <-- https://github.com/googleapis/python-bigquery/issues/792 - # date_col object <-- per https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#overview, should be datetime64[ns] (where possible) - # string_col object - # bool_col object <-- maybe should be "boolean" (added in pandas 1.0.0) - # bytes_col object - # int64_col float64 <-- https://github.com/googleapis/python-bigquery/issues/793 + # pandas uses Python string and bytes objects. + assert df.dtypes["bytes_col"].name == "object" + assert df.dtypes["string_col"].name == "object" +@pytest.mark.parametrize( + ("max_results",), ((None,), (10,),) # Use BQ Storage API. # Use REST API. +) def test_list_rows_nullable_scalars_extreme_dtypes( - bigquery_client, scalars_extreme_table + bigquery_client, scalars_extreme_table, max_results ): - df = bigquery_client.list_rows(scalars_extreme_table).to_dataframe() - - # timestamp_col: timestamp[us, tz=UTC] - # time_col: time64[us] - # float64_col: double - # datetime_col: timestamp[us] - # -- field metadata -- - # ARROW:extension:name: 'google:sqlType:datetime' - # bignumeric_col: decimal256(76, 38) - # numeric_col: decimal128(38, 9) - # geography_col: string - # -- field metadata -- - # ARROW:extension:name: 'google:sqlType:geography' - # ARROW:extension:metadata: '{"encoding": "WKT"}' - # date_col: date32[day] - # string_col: string - # bool_col: bool - # bytes_col: binary - # int64_col: int64 - - # timestamp_col object - # time_col object - # float64_col float64 - # datetime_col object <-- correct, since extreme values are out-of-bounds - # bignumeric_col object - # numeric_col object - # geography_col object - # date_col object - # string_col object - # bool_col object - # bytes_col object - # int64_col float64 + df = bigquery_client.list_rows( + scalars_extreme_table, max_results=max_results + ).to_dataframe() # Extreme values are out-of-bounds for pandas datetime64 values, which use # nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must @@ -883,6 +844,15 @@ def test_list_rows_nullable_scalars_extreme_dtypes( assert df.dtypes["timestamp_col"].name == "object" # These pandas dtypes can handle the same ranges as BigQuery. - assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["int64_col"].name == "Int64" + assert df.dtypes["time_col"].name == "timedelta64[ns]" + + # decimal.Decimal is used to avoid loss of precision. + assert df.dtypes["numeric_col"].name == "object" + assert df.dtypes["bignumeric_col"].name == "object" + + # pandas uses Python string and bytes objects. 
+ assert df.dtypes["bytes_col"].name == "object" + assert df.dtypes["string_col"].name == "object" From 4f78e6d0e4619005581cb9447e992842ca7d1f62 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 23 Jul 2021 16:29:51 -0500 Subject: [PATCH 06/12] use dtype defaults for "easy" cases --- google/cloud/bigquery/_pandas_helpers.py | 31 ++++++++++++++++++++++++ google/cloud/bigquery/table.py | 10 ++++++++ tests/system/test_pandas.py | 12 +++------ 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 2ff96da4d..a9eb7546d 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -18,6 +18,7 @@ import functools import logging import queue +from typing import Sequence import warnings from packaging import version @@ -56,6 +57,14 @@ _MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads +_BQ_TO_PANDAS_DTYPE_NULLSAFE = { + "BOOL": "boolean", + "BOOLEAN": "boolean", + "FLOAT": "float64", + "FLOAT64": "float64", + "INT64": "Int64", + "INTEGER": "Int64", +} _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", "datetime64[ns, UTC]": "TIMESTAMP", @@ -231,6 +240,28 @@ def bq_to_arrow_schema(bq_schema): return pyarrow.schema(arrow_fields) +def bq_schema_to_nullsafe_pandas_dtypes(bq_schema: Sequence[schema.SchemaField]): + """Return the default dtypes to use for columns in a BigQuery schema. + + Only returns default dtypes which are safe to have NULL values. This + includes Int64, which has pandas.NA values and does not result in + loss-of-precision. + + # TODO: document dtype mapping. + + Returns: + Dict[str, str]: mapping from column names to dtypes + """ + dtypes = {} + for bq_field in bq_schema: + if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}: + continue + field_type = bq_field.field_type.upper() + if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE: + dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type] + return dtypes + + def bq_to_arrow_array(series, bq_field): arrow_type = bq_to_arrow_data_type(bq_field) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 18d969a3f..1f1ea34bc 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1906,6 +1906,14 @@ def to_dataframe( create_bqstorage_client=create_bqstorage_client, ) + # Let the user-defined dtypes override the default ones. + # https://stackoverflow.com/a/26853961/101923 + # TODO: test that this actually doesn't override + default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes( + self.schema + ) + dtypes = {**default_dtypes, **dtypes} + # When converting timestamp values to nanosecond precision, the result # can be out of pyarrow bounds. To avoid the error when converting to # Pandas, we set the timestamp_as_object parameter to True, if necessary. @@ -1931,6 +1939,8 @@ def to_dataframe( for column in dtypes: df[column] = pandas.Series(df[column], dtype=dtypes[column]) + # TODO: convert TIME columns, maybe TIMESTAMP too? Only if dtypes was not set. 
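# [Editor's aside - illustrative sketch, not part of the patch] The
# `dtypes = {**default_dtypes, **dtypes}` merge added above gives any
# caller-supplied dtypes precedence over the nullsafe defaults derived from
# the BigQuery schema by bq_schema_to_nullsafe_pandas_dtypes. With
# hypothetical column names:
default_dtypes = {"int64_col": "Int64", "bool_col": "boolean"}
user_dtypes = {"int64_col": "int8"}  # the caller's choice wins for int64_col
merged = {**default_dtypes, **user_dtypes}
assert merged == {"int64_col": "int8", "bool_col": "boolean"}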
+ return df diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 06bf03c6b..62704e326 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -800,20 +800,14 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client): def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results): df = bigquery_client.list_rows( scalars_table, max_results=max_results, - ).to_dataframe( - dtypes={ - "bool_col": "boolean", - "date_col": "datetime64[ns]", - "int64_col": "Int64", - } - ) + ).to_dataframe() assert df.dtypes["bool_col"].name == "boolean" - assert df.dtypes["date_col"].name == "datetime64[ns]" + # TODO: assert df.dtypes["date_col"].name == "datetime64[ns]" assert df.dtypes["datetime_col"].name == "datetime64[ns]" assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["int64_col"].name == "Int64" - assert df.dtypes["time_col"].name == "timedelta64[ns]" + # TODO: assert df.dtypes["time_col"].name == "timedelta64[ns]" assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" # decimal.Decimal is used to avoid loss of precision. From d53aa689b905af7135e78ac824d1620fde25e6d1 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 28 Jul 2021 11:49:44 -0500 Subject: [PATCH 07/12] add interval --- google/cloud/bigquery/_pandas_helpers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index f7658c194..73266befa 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -59,6 +59,8 @@ "FLOAT64": "float64", "INT64": "Int64", "INTEGER": "Int64", + "INTERVAL": "timedelta64[ns]", # TODO: What happens when an interval is outside of ns range? + "TIME": "timedelta64[ns]", } _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", From 6ceff2cfe29c47893fc16a01d3b3a4ba228e10cb Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 6 Aug 2021 17:01:21 -0500 Subject: [PATCH 08/12] WIP: split TIME and DATE into separate issues --- docs/conf.py | 1 + docs/usage/pandas.rst | 25 ++++++++++++++++++++++-- google/cloud/bigquery/_pandas_helpers.py | 7 ------- google/cloud/bigquery/table.py | 6 +++--- tests/system/test_pandas.py | 20 ++++++++++++++----- 5 files changed, 42 insertions(+), 17 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index cb347160d..09f7ea414 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -110,6 +110,7 @@ # directories to ignore when looking for source files. exclude_patterns = [ "_build", + "**/.nox/**/*", "samples/AUTHORING_GUIDE.md", "samples/CONTRIBUTING.md", "samples/snippets/README.rst", diff --git a/docs/usage/pandas.rst b/docs/usage/pandas.rst index 9db98dfbb..b08aebdd6 100644 --- a/docs/usage/pandas.rst +++ b/docs/usage/pandas.rst @@ -14,12 +14,12 @@ First, ensure that the :mod:`pandas` library is installed by running: pip install --upgrade pandas -Alternatively, you can install the BigQuery python client library with +Alternatively, you can install the BigQuery Python client library with :mod:`pandas` by running: .. code-block:: bash - pip install --upgrade google-cloud-bigquery[pandas] + pip install --upgrade 'google-cloud-bigquery[pandas]' To retrieve query results as a :class:`pandas.DataFrame`: @@ -37,6 +37,27 @@ To retrieve table rows as a :class:`pandas.DataFrame`: :start-after: [START bigquery_list_rows_dataframe] :end-before: [END bigquery_list_rows_dataframe] +The following data types are used when creating a pandas DataFrame. + +.. 
list-table:: Pandas Data Type Mapping + :header-rows: 1 + + * - BigQuery + - pandas + - Notes + * - BOOL + - boolean + - + * - DATETIME + - datetime64[ns], object + - object is used when there are values not representable in pandas + * - FLOAT64 + - float64 + - + * - INT64 + - Int64 + - + Load a Pandas DataFrame to a BigQuery Table ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 73266befa..a8f491cac 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -43,11 +43,6 @@ _LOGGER = logging.getLogger(__name__) -_NO_BQSTORAGE_ERROR = ( - "The google-cloud-bigquery-storage library is not installed, " - "please install google-cloud-bigquery-storage to use bqstorage features." -) - _PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds. _MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads @@ -59,8 +54,6 @@ "FLOAT64": "float64", "INT64": "Int64", "INTEGER": "Int64", - "INTERVAL": "timedelta64[ns]", # TODO: What happens when an interval is outside of ns range? - "TIME": "timedelta64[ns]", } _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 9bee896bf..42761e7f4 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1924,13 +1924,13 @@ def to_dataframe( extra_kwargs = {"timestamp_as_object": timestamp_as_object} - df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs) + df = record_batch.to_pandas( + date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs + ) for column in dtypes: df[column] = pandas.Series(df[column], dtype=dtypes[column]) - # TODO: convert TIME columns, maybe TIMESTAMP too? Only if dtypes was not set. - return df diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 54347f4d6..411c9bed0 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -567,7 +567,7 @@ def test_query_results_to_dataframe(bigquery_client): for _, row in df.iterrows(): for col in column_names: # all the schema fields are nullable, so None is acceptable - if not row[col] is None: + if not pandas.isna(row[col]): assert isinstance(row[col], exp_datatypes[col]) @@ -597,7 +597,7 @@ def test_query_results_to_dataframe_w_bqstorage(bigquery_client): for index, row in df.iterrows(): for col in column_names: # all the schema fields are nullable, so None is acceptable - if not row[col] is None: + if not pandas.isna(row[col]): assert isinstance(row[col], exp_datatypes[col]) @@ -806,13 +806,20 @@ def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_r ).to_dataframe() assert df.dtypes["bool_col"].name == "boolean" - # TODO: assert df.dtypes["date_col"].name == "datetime64[ns]" assert df.dtypes["datetime_col"].name == "datetime64[ns]" assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["int64_col"].name == "Int64" - # TODO: assert df.dtypes["time_col"].name == "timedelta64[ns]" assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" + # object is used by default, but we can use "datetime64[ns]" automatically + # when data is within the supported range. 
+ # https://github.com/googleapis/python-bigquery/issues/861 + assert df.dtypes["date_col"].name == "object" + + # object is used by default, but we can use "timedelta64[ns]" automatically + # https://github.com/googleapis/python-bigquery/issues/862 + assert df.dtypes["time_col"].name == "object" + # decimal.Decimal is used to avoid loss of precision. assert df.dtypes["bignumeric_col"].name == "object" assert df.dtypes["numeric_col"].name == "object" @@ -844,7 +851,10 @@ def test_list_rows_nullable_scalars_extreme_dtypes( assert df.dtypes["bool_col"].name == "boolean" assert df.dtypes["float64_col"].name == "float64" assert df.dtypes["int64_col"].name == "Int64" - assert df.dtypes["time_col"].name == "timedelta64[ns]" + + # object is used by default, but we can use "timedelta64[ns]" automatically + # https://github.com/googleapis/python-bigquery/issues/862 + assert df.dtypes["time_col"].name == "object" # decimal.Decimal is used to avoid loss of precision. assert df.dtypes["numeric_col"].name == "object" From 18152d9a2af71b5c1828a830001579f3c4d69cce Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 9 Aug 2021 09:35:08 -0500 Subject: [PATCH 09/12] WIP: unit tests --- google/cloud/bigquery/_pandas_helpers.py | 3 +- tests/unit/test_table_pandas.py | 44 ++++++++++++++---------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index a8f491cac..a3f6c0468 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -47,6 +47,7 @@ _MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads +# If you update the default dtypes, also update the docs at docs/usage/pandas.rst. _BQ_TO_PANDAS_DTYPE_NULLSAFE = { "BOOL": "boolean", "BOOLEAN": "boolean", @@ -228,8 +229,6 @@ def bq_schema_to_nullsafe_pandas_dtypes(bq_schema: Sequence[schema.SchemaField]) includes Int64, which has pandas.NA values and does not result in loss-of-precision. - # TODO: document dtype mapping. - Returns: Dict[str, str]: mapping from column names to dtypes """ diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index a2da48343..a6e320036 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import decimal from unittest import mock +import pyarrow import pytest from google.cloud import bigquery @@ -31,31 +33,35 @@ def class_under_test(): return RowIterator -def test_to_dataframe_defaults_to_nullable_dtypes(class_under_test): +def test_to_dataframe_defaults_to_nullable_dtypes(monkeypatch, class_under_test): + arrow_schema = pyarrow.schema( + [pyarrow.field("bignumeric_col", pyarrow.decimal256(76, scale=38))] + ) + arrow_table = pyarrow.Table.from_pydict( + {"bignumeric_col": [decimal.Decimal("123.456")]}, schema=arrow_schema, + ) + nullable_schema = [ - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("datetime_col", "DATETIME"), - bigquery.SchemaField("float_col", "FLOAT"), - bigquery.SchemaField("float64_col", "FLOAT64"), - bigquery.SchemaField("integer_col", "INTEGER"), - bigquery.SchemaField("int64_col", "INT64"), - bigquery.SchemaField( - "time_col", "TIME" - ), # TODO: use timedelta64 dtype for this? 
- bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + bigquery.SchemaField("bignumeric_col", "BIGNUMERIC"), + # bigquery.SchemaField("date_col", "DATE"), + # bigquery.SchemaField("datetime_col", "DATETIME"), + # bigquery.SchemaField("float_col", "FLOAT"), + # bigquery.SchemaField("float64_col", "FLOAT64"), + # bigquery.SchemaField("integer_col", "INTEGER"), + # bigquery.SchemaField("int64_col", "INT64"), + # bigquery.SchemaField( "time_col", "TIME"), + # bigquery.SchemaField("timestamp_col", "TIMESTAMP"), ] mock_client = mock.create_autospec(bigquery.Client) mock_client.project = "test-proj" mock_api_request = mock.Mock() - rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema,) - rows.to_dataframe() # TODO: if we are always using BQ Storage API for - # to_dataframe, maybe wait to implement until after required? - # TODO: behavior is based on schema (and data rows) - assert False - + mock_to_arrow = mock.Mock() + mock_to_arrow.return_value = arrow_table + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema) + monkeypatch.setattr(rows, "to_arrow", mock_to_arrow) + rows.to_dataframe() -def test_to_dataframe_bqstorage_defaults_to_nullable_dtypes(class_under_test): - # TODO: behavior is based on schema (and data rows) + # TODO: check dtypes, check values assert False From 2e957cda82f4dafad92eafe48cbd12d2c46957f2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 9 Aug 2021 15:03:56 -0500 Subject: [PATCH 10/12] add tests, update minimum pandas version --- google/cloud/bigquery/table.py | 7 +- setup.py | 2 +- testing/constraints-3.6.txt | 2 +- tests/unit/job/test_query_pandas.py | 22 +---- tests/unit/test_table.py | 8 +- tests/unit/test_table_pandas.py | 133 +++++++++++++++++++++++----- 6 files changed, 125 insertions(+), 49 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 42761e7f4..d2e992dca 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1895,13 +1895,12 @@ def to_dataframe( bqstorage_client=bqstorage_client, create_bqstorage_client=create_bqstorage_client, ) - - # Let the user-defined dtypes override the default ones. - # https://stackoverflow.com/a/26853961/101923 - # TODO: test that this actually doesn't override default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes( self.schema ) + + # Let the user-defined dtypes override the default ones. + # https://stackoverflow.com/a/26853961/101923 dtypes = {**default_dtypes, **dtypes} # When converting timestamp values to nanosecond precision, the result diff --git a/setup.py b/setup.py index 5205b5365..6fa619d37 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ # Keep the no-op bqstorage extra for backward compatibility. 
# See: https://github.com/googleapis/python-bigquery/issues/757 "bqstorage": [], - "pandas": ["pandas>=0.23.0"], + "pandas": ["pandas>=1.0.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ "opentelemetry-api >= 0.11b0", diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index ce012f0d7..bf1f89f58 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -13,7 +13,7 @@ grpcio==1.38.1 opentelemetry-api==0.11b0 opentelemetry-instrumentation==0.11b0 opentelemetry-sdk==0.11b0 -pandas==0.23.0 +pandas==1.0.0 proto-plus==1.10.0 protobuf==3.12.0 pyarrow==3.0.0 diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index e5105974f..c3a9d2d1a 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -20,11 +20,6 @@ import pyarrow import pytest -try: - import pandas -except (ImportError, AttributeError): # pragma: NO COVER - pandas = None - from google.cloud import bigquery_storage try: @@ -36,6 +31,8 @@ from .helpers import _make_connection from .helpers import _make_job_resource +pandas = pytest.importorskip("pandas") + @pytest.fixture def table_read_options_kwarg(): @@ -78,7 +75,6 @@ def test__contains_order_by(query, expected): assert not mut._contains_order_by(query) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.parametrize( "query", ( @@ -413,7 +409,6 @@ def test_to_arrow_w_tqdm_wo_query_plan(): result_patch_tqdm.assert_called() -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe(): from google.cloud.bigquery.job import QueryJob as target_class @@ -452,7 +447,6 @@ def test_to_dataframe(): assert list(df) == ["name", "age"] # verify the column names -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_ddl_query(): from google.cloud.bigquery.job import QueryJob as target_class @@ -472,7 +466,6 @@ def test_to_dataframe_ddl_query(): assert len(df) == 0 -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_bqstorage(table_read_options_kwarg): from google.cloud.bigquery.job import QueryJob as target_class @@ -522,7 +515,6 @@ def test_to_dataframe_bqstorage(table_read_options_kwarg): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_bqstorage_no_pyarrow_compression(): from google.cloud.bigquery.job import QueryJob as target_class @@ -565,7 +557,6 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression(): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_column_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class @@ -617,15 +608,14 @@ def test_to_dataframe_column_dtypes(): assert list(df) == exp_columns # verify the column names assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]" - assert df.seconds.dtype.name == "int64" + assert df.seconds.dtype.name == "Int64" assert df.miles.dtype.name == "float64" assert df.km.dtype.name == "float16" assert df.payment_type.dtype.name == "object" - assert df.complete.dtype.name == "bool" + assert df.complete.dtype.name == "boolean" assert df.date.dtype.name == "object" -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_column_date_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class @@ -657,7 +647,6 @@ def test_to_dataframe_column_date_dtypes(): assert df.date.dtype.name == "datetime64[ns]" -@pytest.mark.skipif(pandas is None, 
reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") @mock.patch("tqdm.tqdm") def test_to_dataframe_with_progress_bar(tqdm_mock): @@ -685,7 +674,6 @@ def test_to_dataframe_with_progress_bar(tqdm_mock): tqdm_mock.assert_called() -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_dataframe_w_tqdm_pending(): from google.cloud.bigquery import table @@ -741,7 +729,6 @@ def test_to_dataframe_w_tqdm_pending(): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_dataframe_w_tqdm(): from google.cloud.bigquery import table @@ -801,7 +788,6 @@ def test_to_dataframe_w_tqdm(): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_dataframe_w_tqdm_max_results(): from google.cloud.bigquery import table diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 20336b227..bd1bdad29 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -2578,7 +2578,7 @@ def test_to_dataframe(self): self.assertEqual(len(df), 4) # verify the number of rows self.assertEqual(list(df), ["name", "age"]) # verify the column names self.assertEqual(df.name.dtype.name, "object") - self.assertEqual(df.age.dtype.name, "int64") + self.assertEqual(df.age.dtype.name, "Int64") @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): @@ -2821,7 +2821,7 @@ def test_to_dataframe_w_various_types_nullable(self): self.assertTrue(row.isnull().all()) else: self.assertIsInstance(row.start_timestamp, pandas.Timestamp) - self.assertIsInstance(row.seconds, float) + self.assertIsInstance(row.seconds, int) self.assertIsInstance(row.payment_type, str) self.assertIsInstance(row.complete, bool) self.assertIsInstance(row.date, datetime.date) @@ -2867,11 +2867,11 @@ def test_to_dataframe_column_dtypes(self): self.assertEqual(list(df), exp_columns) # verify the column names self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]") - self.assertEqual(df.seconds.dtype.name, "int64") + self.assertEqual(df.seconds.dtype.name, "Int64") self.assertEqual(df.miles.dtype.name, "float64") self.assertEqual(df.km.dtype.name, "float16") self.assertEqual(df.payment_type.dtype.name, "object") - self.assertEqual(df.complete.dtype.name, "bool") + self.assertEqual(df.complete.dtype.name, "boolean") self.assertEqual(df.date.dtype.name, "object") @mock.patch("google.cloud.bigquery.table.pandas", new=None) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index a6e320036..a9f7ed58a 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import decimal from unittest import mock @@ -34,23 +35,57 @@ def class_under_test(): def test_to_dataframe_defaults_to_nullable_dtypes(monkeypatch, class_under_test): + # See tests/system/test_arrow.py for the actual types we get from the API. 
arrow_schema = pyarrow.schema( - [pyarrow.field("bignumeric_col", pyarrow.decimal256(76, scale=38))] + [ + pyarrow.field("bignumeric_col", pyarrow.decimal256(76, scale=38)), + pyarrow.field("bool_col", pyarrow.bool_()), + pyarrow.field("bytes_col", pyarrow.binary()), + pyarrow.field("date_col", pyarrow.date32()), + pyarrow.field("datetime_col", pyarrow.timestamp("us", tz=None)), + pyarrow.field("float64_col", pyarrow.float64()), + pyarrow.field("int64_col", pyarrow.int64()), + pyarrow.field("numeric_col", pyarrow.decimal128(38, scale=9)), + pyarrow.field("string_col", pyarrow.string()), + pyarrow.field("time_col", pyarrow.time64("us")), + pyarrow.field( + "timestamp_col", pyarrow.timestamp("us", tz=datetime.timezone.utc) + ), + ] ) arrow_table = pyarrow.Table.from_pydict( - {"bignumeric_col": [decimal.Decimal("123.456")]}, schema=arrow_schema, + { + "bignumeric_col": [decimal.Decimal("123.456789101112131415")], + "bool_col": [True], + "bytes_col": [b"Hello,\x00World!"], + "date_col": [datetime.date(2021, 8, 9)], + "datetime_col": [datetime.datetime(2021, 8, 9, 13, 30, 44, 123456)], + "float64_col": [1.25], + "int64_col": [-7], + "numeric_col": [decimal.Decimal("-123.456789")], + "string_col": ["abcdefg"], + "time_col": [datetime.time(14, 21, 17, 123456)], + "timestamp_col": [ + datetime.datetime( + 2021, 8, 9, 13, 30, 44, 123456, tzinfo=datetime.timezone.utc + ) + ], + }, + schema=arrow_schema, ) nullable_schema = [ bigquery.SchemaField("bignumeric_col", "BIGNUMERIC"), - # bigquery.SchemaField("date_col", "DATE"), - # bigquery.SchemaField("datetime_col", "DATETIME"), - # bigquery.SchemaField("float_col", "FLOAT"), - # bigquery.SchemaField("float64_col", "FLOAT64"), - # bigquery.SchemaField("integer_col", "INTEGER"), - # bigquery.SchemaField("int64_col", "INT64"), - # bigquery.SchemaField( "time_col", "TIME"), - # bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("datetime_col", "DATETIME"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("int64_col", "INT64"), + bigquery.SchemaField("numeric_col", "NUMERIC"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("timestamp_col", "TIMESTAMP"), ] mock_client = mock.create_autospec(bigquery.Client) mock_client.project = "test-proj" @@ -59,17 +94,73 @@ def test_to_dataframe_defaults_to_nullable_dtypes(monkeypatch, class_under_test) mock_to_arrow.return_value = arrow_table rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema) monkeypatch.setattr(rows, "to_arrow", mock_to_arrow) - rows.to_dataframe() - - # TODO: check dtypes, check values - assert False - - -def test_to_dataframe_overrides_nullable_dtypes(class_under_test): + df = rows.to_dataframe() + + # Check for expected dtypes. 
+ # Keep these in sync with tests/system/test_pandas.py + assert df.dtypes["bignumeric_col"].name == "object" + assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["bytes_col"].name == "object" + assert df.dtypes["date_col"].name == "object" + assert df.dtypes["datetime_col"].name == "datetime64[ns]" + assert df.dtypes["float64_col"].name == "float64" + assert df.dtypes["int64_col"].name == "Int64" + assert df.dtypes["numeric_col"].name == "object" + assert df.dtypes["string_col"].name == "object" + assert df.dtypes["time_col"].name == "object" + assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" + + # Check for expected values. + assert df["bignumeric_col"][0] == decimal.Decimal("123.456789101112131415") + assert df["bool_col"][0] # True + assert df["bytes_col"][0] == b"Hello,\x00World!" + + # object is used by default, but we can use "datetime64[ns]" automatically + # when data is within the supported range. + # https://github.com/googleapis/python-bigquery/issues/861 + assert df["date_col"][0] == datetime.date(2021, 8, 9) + + assert df["datetime_col"][0] == pandas.to_datetime("2021-08-09 13:30:44.123456") + assert df["float64_col"][0] == 1.25 + assert df["int64_col"][0] == -7 + assert df["numeric_col"][0] == decimal.Decimal("-123.456789") + assert df["string_col"][0] == "abcdefg" + + # Pandas timedelta64 might be a better choice for pandas time columns. Then + # they can more easily be combined with date columns to form datetimes. + # https://github.com/googleapis/python-bigquery/issues/862 + assert df["time_col"][0] == datetime.time(14, 21, 17, 123456) + + assert df["timestamp_col"][0] == pandas.to_datetime("2021-08-09 13:30:44.123456Z") + + +def test_to_dataframe_overrides_nullable_dtypes(monkeypatch, class_under_test): """Passing in explicit dtypes is merged with default behavior.""" - assert False + arrow_schema = pyarrow.schema( + [ + pyarrow.field("int64_col", pyarrow.int64()), + pyarrow.field("other_int_col", pyarrow.int64()), + ] + ) + arrow_table = pyarrow.Table.from_pydict( + {"int64_col": [1000], "other_int_col": [-7]}, schema=arrow_schema, + ) + + nullable_schema = [ + bigquery.SchemaField("int64_col", "INT64"), + bigquery.SchemaField("other_int_col", "INT64"), + ] + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + mock_to_arrow = mock.Mock() + mock_to_arrow.return_value = arrow_table + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema) + monkeypatch.setattr(rows, "to_arrow", mock_to_arrow) + df = rows.to_dataframe(dtypes={"other_int_col": "int8"}) + assert df.dtypes["int64_col"].name == "Int64" + assert df["int64_col"][0] == 1000 -def test_to_dataframe_bqstorage_overrides_nullable_dtypes(class_under_test): - """Passing in explicit dtypes is merged with default behavior.""" - assert False + assert df.dtypes["other_int_col"].name == "int8" + assert df["other_int_col"][0] == -7 From 8f90c511d9e97c0341352b71ee4499d886c9c78d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 11 Aug 2021 09:58:15 -0500 Subject: [PATCH 11/12] add unit test for repeated fields --- tests/unit/test_table_pandas.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index a9f7ed58a..a223e6652 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -34,7 +34,7 @@ def class_under_test(): return 
RowIterator -def test_to_dataframe_defaults_to_nullable_dtypes(monkeypatch, class_under_test): +def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): # See tests/system/test_arrow.py for the actual types we get from the API. arrow_schema = pyarrow.schema( [ @@ -134,7 +134,9 @@ def test_to_dataframe_defaults_to_nullable_dtypes(monkeypatch, class_under_test) assert df["timestamp_col"][0] == pandas.to_datetime("2021-08-09 13:30:44.123456Z") -def test_to_dataframe_overrides_nullable_dtypes(monkeypatch, class_under_test): +def test_to_dataframe_nullable_scalars_with_custom_dtypes( + monkeypatch, class_under_test +): """Passing in explicit dtypes is merged with default behavior.""" arrow_schema = pyarrow.schema( [ @@ -164,3 +166,27 @@ def test_to_dataframe_overrides_nullable_dtypes(monkeypatch, class_under_test): assert df.dtypes["other_int_col"].name == "int8" assert df["other_int_col"][0] == -7 + + +def test_to_dataframe_arrays(monkeypatch, class_under_test): + arrow_schema = pyarrow.schema( + [pyarrow.field("int64_repeated", pyarrow.list_(pyarrow.int64()))] + ) + arrow_table = pyarrow.Table.from_pydict( + {"int64_repeated": [[-1, 0, 2]]}, schema=arrow_schema, + ) + + nullable_schema = [ + bigquery.SchemaField("int64_repeated", "INT64", mode="REPEATED"), + ] + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + mock_to_arrow = mock.Mock() + mock_to_arrow.return_value = arrow_table + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema) + monkeypatch.setattr(rows, "to_arrow", mock_to_arrow) + df = rows.to_dataframe() + + assert df.dtypes["int64_repeated"].name == "object" + assert tuple(df["int64_repeated"][0]) == (-1, 0, 2) From 3155dab5e8a8e6c126152399e5ba5c50dc7ef4e9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 16 Aug 2021 10:07:25 -0500 Subject: [PATCH 12/12] Address docs nits --- docs/usage/pandas.rst | 2 +- google/cloud/bigquery/_pandas_helpers.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/usage/pandas.rst b/docs/usage/pandas.rst index b08aebdd6..40732a298 100644 --- a/docs/usage/pandas.rst +++ b/docs/usage/pandas.rst @@ -66,7 +66,7 @@ As of version 1.3.0, you can use the to load data from a :class:`pandas.DataFrame` to a :class:`~google.cloud.bigquery.table.Table`. To use this function, in addition to :mod:`pandas`, you will need to install the :mod:`pyarrow` library. You can -install the BigQuery python client library with :mod:`pandas` and +install the BigQuery Python client library with :mod:`pandas` and :mod:`pyarrow` by running: .. code-block:: bash diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index a3f6c0468..88759bd18 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -18,7 +18,7 @@ import functools import logging import queue -from typing import Sequence +from typing import Dict, Sequence import warnings try: @@ -222,7 +222,9 @@ def bq_to_arrow_schema(bq_schema): return pyarrow.schema(arrow_fields) -def bq_schema_to_nullsafe_pandas_dtypes(bq_schema: Sequence[schema.SchemaField]): +def bq_schema_to_nullsafe_pandas_dtypes( + bq_schema: Sequence[schema.SchemaField], +) -> Dict[str, str]: """Return the default dtypes to use for columns in a BigQuery schema. Only returns default dtypes which are safe to have NULL values. 
This @@ -230,7 +232,7 @@ def bq_schema_to_nullsafe_pandas_dtypes(bq_schema: Sequence[schema.SchemaField]) loss-of-precision. Returns: - Dict[str, str]: mapping from column names to dtypes + A mapping from column names to pandas dtypes. """ dtypes = {} for bq_field in bq_schema:
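
Note on the helper touched in the last hunk: based on the unit tests earlier in this series, only columns whose pandas representation can hold NULLs without loss receive a default dtype (BOOLEAN -> "boolean", INTEGER/INT64 -> "Int64", FLOAT/FLOAT64 -> "float64"); DATE, TIME, NUMERIC, BYTES, STRING, and REPEATED fields keep the existing object/datetime behavior. The following is a minimal sketch of that idea only, assuming the public `SchemaField` attributes `name`, `field_type`, and `mode`; the names `_NULLSAFE_DTYPES` and `nullsafe_dtypes_sketch` are illustrative and this is not the actual body of `bq_schema_to_nullsafe_pandas_dtypes`.

.. code-block:: python

    from typing import Dict, Sequence

    from google.cloud.bigquery import schema

    # Hypothetical mapping for illustration; the library's real table may differ.
    _NULLSAFE_DTYPES = {
        "BOOLEAN": "boolean",
        "BOOL": "boolean",
        "INTEGER": "Int64",
        "INT64": "Int64",
        "FLOAT": "float64",
        "FLOAT64": "float64",
    }


    def nullsafe_dtypes_sketch(
        bq_schema: Sequence[schema.SchemaField],
    ) -> Dict[str, str]:
        """Map scalar columns to pandas dtypes that can represent NULL."""
        dtypes = {}
        for bq_field in bq_schema:
            if (bq_field.mode or "NULLABLE").upper() == "REPEATED":
                # Repeated fields stay as object-dtype arrays of values.
                continue
            dtype = _NULLSAFE_DTYPES.get(bq_field.field_type.upper())
            if dtype is not None:
                dtypes[bq_field.name] = dtype
        return dtypes

`RowIterator.to_dataframe` then merges these defaults with any user-supplied `dtypes`, and the explicit argument wins, which is what the `int8` override in `test_to_dataframe_nullable_scalars_with_custom_dtypes` exercises.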