|
20 | 20 | import queue |
21 | 21 | import warnings |
22 | 22 |
|
23 | | -from packaging import version |
24 | | - |
25 | 23 | try: |
26 | 24 | import pandas |
27 | 25 | except ImportError: # pragma: NO COVER |
28 | 26 | pandas = None |
29 | 27 |
|
30 | | -try: |
31 | | - import pyarrow |
32 | | - import pyarrow.parquet |
33 | | -except ImportError: # pragma: NO COVER |
34 | | - pyarrow = None |
| 28 | +import pyarrow |
| 29 | +import pyarrow.parquet |
35 | 30 |
|
36 | 31 | try: |
37 | 32 | from google.cloud.bigquery_storage import ArrowSerializationOptions |
@@ -106,63 +101,52 @@ def pyarrow_timestamp(): |
106 | 101 | return pyarrow.timestamp("us", tz="UTC") |
107 | 102 |
|
108 | 103 |
|
109 | | -if pyarrow: |
110 | | - # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py |
111 | | - # When modifying it be sure to update it there as well. |
112 | | - BQ_TO_ARROW_SCALARS = { |
113 | | - "BOOL": pyarrow.bool_, |
114 | | - "BOOLEAN": pyarrow.bool_, |
115 | | - "BYTES": pyarrow.binary, |
116 | | - "DATE": pyarrow.date32, |
117 | | - "DATETIME": pyarrow_datetime, |
118 | | - "FLOAT": pyarrow.float64, |
119 | | - "FLOAT64": pyarrow.float64, |
120 | | - "GEOGRAPHY": pyarrow.string, |
121 | | - "INT64": pyarrow.int64, |
122 | | - "INTEGER": pyarrow.int64, |
123 | | - "NUMERIC": pyarrow_numeric, |
124 | | - "STRING": pyarrow.string, |
125 | | - "TIME": pyarrow_time, |
126 | | - "TIMESTAMP": pyarrow_timestamp, |
127 | | - } |
128 | | - ARROW_SCALAR_IDS_TO_BQ = { |
129 | | - # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes |
130 | | - pyarrow.bool_().id: "BOOL", |
131 | | - pyarrow.int8().id: "INT64", |
132 | | - pyarrow.int16().id: "INT64", |
133 | | - pyarrow.int32().id: "INT64", |
134 | | - pyarrow.int64().id: "INT64", |
135 | | - pyarrow.uint8().id: "INT64", |
136 | | - pyarrow.uint16().id: "INT64", |
137 | | - pyarrow.uint32().id: "INT64", |
138 | | - pyarrow.uint64().id: "INT64", |
139 | | - pyarrow.float16().id: "FLOAT64", |
140 | | - pyarrow.float32().id: "FLOAT64", |
141 | | - pyarrow.float64().id: "FLOAT64", |
142 | | - pyarrow.time32("ms").id: "TIME", |
143 | | - pyarrow.time64("ns").id: "TIME", |
144 | | - pyarrow.timestamp("ns").id: "TIMESTAMP", |
145 | | - pyarrow.date32().id: "DATE", |
146 | | - pyarrow.date64().id: "DATETIME", # because millisecond resolution |
147 | | - pyarrow.binary().id: "BYTES", |
148 | | - pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() |
149 | | - # The exact scale and precision don't matter, see below. |
150 | | - pyarrow.decimal128(38, scale=9).id: "NUMERIC", |
151 | | - } |
152 | | - |
153 | | - if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): |
154 | | - BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric |
155 | | - # The exact decimal's scale and precision are not important, as only |
156 | | - # the type ID matters, and it's the same for all decimal256 instances. |
157 | | - ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" |
158 | | - _BIGNUMERIC_SUPPORT = True |
159 | | - else: |
160 | | - _BIGNUMERIC_SUPPORT = False |
161 | | - |
162 | | -else: # pragma: NO COVER |
163 | | - BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER |
164 | | - ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER |
165 | | - _BIGNUMERIC_SUPPORT = False # pragma: NO COVER |
| 104 | +# This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py |
| 105 | +# When modifying it be sure to update it there as well. |
| 106 | +BQ_TO_ARROW_SCALARS = { |
| 107 | + "BIGNUMERIC": pyarrow_bignumeric, |
| 108 | + "BOOL": pyarrow.bool_, |
| 109 | + "BOOLEAN": pyarrow.bool_, |
| 110 | + "BYTES": pyarrow.binary, |
| 111 | + "DATE": pyarrow.date32, |
| 112 | + "DATETIME": pyarrow_datetime, |
| 113 | + "FLOAT": pyarrow.float64, |
| 114 | + "FLOAT64": pyarrow.float64, |
| 115 | + "GEOGRAPHY": pyarrow.string, |
| 116 | + "INT64": pyarrow.int64, |
| 117 | + "INTEGER": pyarrow.int64, |
| 118 | + "NUMERIC": pyarrow_numeric, |
| 119 | + "STRING": pyarrow.string, |
| 120 | + "TIME": pyarrow_time, |
| 121 | + "TIMESTAMP": pyarrow_timestamp, |
| 122 | +} |
| 123 | +ARROW_SCALAR_IDS_TO_BQ = { |
| 124 | + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes |
| 125 | + pyarrow.bool_().id: "BOOL", |
| 126 | + pyarrow.int8().id: "INT64", |
| 127 | + pyarrow.int16().id: "INT64", |
| 128 | + pyarrow.int32().id: "INT64", |
| 129 | + pyarrow.int64().id: "INT64", |
| 130 | + pyarrow.uint8().id: "INT64", |
| 131 | + pyarrow.uint16().id: "INT64", |
| 132 | + pyarrow.uint32().id: "INT64", |
| 133 | + pyarrow.uint64().id: "INT64", |
| 134 | + pyarrow.float16().id: "FLOAT64", |
| 135 | + pyarrow.float32().id: "FLOAT64", |
| 136 | + pyarrow.float64().id: "FLOAT64", |
| 137 | + pyarrow.time32("ms").id: "TIME", |
| 138 | + pyarrow.time64("ns").id: "TIME", |
| 139 | + pyarrow.timestamp("ns").id: "TIMESTAMP", |
| 140 | + pyarrow.date32().id: "DATE", |
| 141 | + pyarrow.date64().id: "DATETIME", # because millisecond resolution |
| 142 | + pyarrow.binary().id: "BYTES", |
| 143 | + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() |
| 144 | + # The exact scale and precision don't matter, see below. |
| 145 | + pyarrow.decimal128(38, scale=9).id: "NUMERIC", |
| 146 | + # The exact decimal's scale and precision are not important, as only |
| 147 | + # the type ID matters, and it's the same for all decimal256 instances. |
| 148 | + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", |
| 149 | +} |
166 | 150 |
|
167 | 151 |
|
168 | 152 | def bq_to_arrow_struct_data_type(field): |
@@ -346,13 +330,6 @@ def dataframe_to_bq_schema(dataframe, bq_schema): |
346 | 330 | # If schema detection was not successful for all columns, also try with |
347 | 331 | # pyarrow, if available. |
348 | 332 | if unknown_type_fields: |
349 | | - if not pyarrow: |
350 | | - msg = u"Could not determine the type of columns: {}".format( |
351 | | - ", ".join(field.name for field in unknown_type_fields) |
352 | | - ) |
353 | | - warnings.warn(msg) |
354 | | - return None # We cannot detect the schema in full. |
355 | | - |
356 | 333 | # The augment_schema() helper itself will also issue unknown type |
357 | 334 | # warnings if detection still fails for any of the fields. |
358 | 335 | bq_schema_out = augment_schema(dataframe, bq_schema_out) |
@@ -494,9 +471,6 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN |
494 | 471 | serializing method. Defaults to "SNAPPY". |
495 | 472 | https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table |
496 | 473 | """ |
497 | | - if pyarrow is None: |
498 | | - raise ValueError("pyarrow is required for BigQuery schema conversion.") |
499 | | - |
500 | 474 | bq_schema = schema._to_schema_fields(bq_schema) |
501 | 475 | arrow_table = dataframe_to_arrow(dataframe, bq_schema) |
502 | 476 | pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression) |
|
0 commit comments