Fallback to CPU when Spark pushes down Aggregates (Min/Max/Count) for ORC #4859
@@ -486,4 +486,47 @@ def test_orc_read_with_corrupt_files(spark_tmp_path, reader_confs, v1_enabled_li
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark : spark.read.orc([first_data_path, second_data_path, third_data_path]),
            conf=all_confs)

conf_for_orc_aggregate_pushdown = {
    # Aggregate pushdown is a v2-datasource feature, so clear the v1 source
    # list to force ORC onto the v2 read path.
    "spark.sql.orc.aggregatePushdown": "true",
    "spark.sql.sources.useV1SourceList": ""
}

@pytest.mark.skipif(is_before_spark_330(), reason='Aggregate push down on ORC is a new feature of Spark 330')
@allow_non_gpu(any=True)
def test_orc_scan_with_aggregation_pushdown_fallback(spark_tmp_path):
    """
    Spark pushes the aggregation down into the ORC scan here, so the scan must fall back to the CPU.
    """
    data_path = spark_tmp_path + '/pushdown.orc'

    def do_orc_scan(spark):
        df = spark.read.orc(data_path).selectExpr("count(p)")
        return df

    with_cpu_session(lambda spark: spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").mode("overwrite").orc(data_path))

    assert_cpu_and_gpu_are_equal_collect_with_capture(
        do_orc_scan,
        exist_classes='BatchScanExec',
        non_exist_classes='GpuBatchScanExec',
        conf=conf_for_orc_aggregate_pushdown)

@pytest.mark.skipif(is_before_spark_330(), reason='Aggregate push down on ORC is a new feature of Spark 330')
def test_orc_scan_without_aggregation_pushdown_not_fallback(spark_tmp_path):
    """
    No aggregation is pushed down here, so the scan should stay on the GPU.
    """
    data_path = spark_tmp_path + "/pushdown.orc"

    def do_orc_scan(spark):
        df = spark.read.orc(data_path).selectExpr("max(p)")
        return df

    with_cpu_session(lambda spark: spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").mode("overwrite").orc(data_path))

    assert_gpu_and_cpu_are_equal_collect(
        do_orc_scan,
        conf=conf_for_orc_aggregate_pushdown)
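
For context, the fallback exercised by the first test has to come from the Spark 3.3.0 shim's ORC scan meta, which this commit does not show. A minimal sketch of that check, assuming Spark 3.3's OrcScan exposes the pushed aggregation as a pushedAggregate field (the field access and message below are illustrative, not the PR's exact code):

override def tagSelfForGpu(): Unit = {
  GpuOrcScanBase.tagSupport(this)
  // When spark.sql.orc.aggregatePushdown is on, Spark 3.3 may fold
  // Min/Max/Count into the scan itself; the GPU reader cannot honor
  // that, so tag the whole scan to stay on the CPU.
  if (oScan.pushedAggregate.nonEmpty) {
    willNotWorkOnGpu("aggregates pushed into the ORC scan are not supported on the GPU")
  }
}

That tag is what keeps BatchScanExec in the plan instead of GpuBatchScanExec, which is exactly what the capture assertion checks.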
@@ -0,0 +1,47 @@
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nvidia.spark.rapids.shims.v2

import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuOrcScanBase, RapidsConf, RapidsMeta, ScanMeta}

import org.apache.spark.sql.connector.read.Scan
import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan

class RapidsOrcScanMeta(
    oScan: OrcScan,
    conf: RapidsConf,
    parent: Option[RapidsMeta[_, _, _]],
    rule: DataFromReplacementRule)
  extends ScanMeta[OrcScan](oScan, conf, parent, rule) {

  override def tagSelfForGpu(): Unit = {
    GpuOrcScanBase.tagSupport(this)
  }

  override def convertToGpu(): Scan =
    GpuOrcScan(oScan.sparkSession,
      oScan.hadoopConf,
      oScan.fileIndex,
      oScan.dataSchema,
      oScan.readDataSchema,
      oScan.readPartitionSchema,
      oScan.options,
      oScan.pushedFilters,
      oScan.partitionFilters,
      oScan.dataFilters,
      conf)
}
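
In this shim the ORC meta only delegates to GpuOrcScanBase.tagSupport; the pushedAggregate guard sketched earlier is only needed on Spark versions that can push aggregates down. Metas like this get picked up through the plugin's scan-replacement rules; a hypothetical wiring, following the GpuOverrides.scan pattern used elsewhere in the plugin (the description string is made up):

GpuOverrides.scan[OrcScan](
  "ORC parsing",
  (scan, conf, p, r) => new RapidsOrcScanMeta(scan, conf, p, r))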
@@ -0,0 +1,48 @@
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nvidia.spark.rapids.shims.v2

import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuParquetScanBase, RapidsConf, RapidsMeta, ScanMeta}

import org.apache.spark.sql.connector.read.Scan
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan

class RapidsParquetScanMeta(
    pScan: ParquetScan,
    conf: RapidsConf,
    parent: Option[RapidsMeta[_, _, _]],
    rule: DataFromReplacementRule)
  extends ScanMeta[ParquetScan](pScan, conf, parent, rule) {

  override def tagSelfForGpu(): Unit = {
    GpuParquetScanBase.tagSupport(this)
  }

  override def convertToGpu(): Scan = {
    GpuParquetScan(pScan.sparkSession,
      pScan.hadoopConf,
      pScan.fileIndex,
      pScan.dataSchema,
      pScan.readDataSchema,
      pScan.readPartitionSchema,
      pScan.pushedFilters,
      pScan.options,
      pScan.partitionFilters,
      pScan.dataFilters,
      conf)
  }
}
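
Spark 3.3.0 adds the same aggregate pushdown machinery to Parquet (spark.sql.parquet.aggregatePushdown), so the 3.3.0 shim presumably needs the analogous guard on ParquetScan, mirroring the ORC sketch above (field name again assumed):

if (pScan.pushedAggregate.nonEmpty) {
  willNotWorkOnGpu("aggregates pushed into the Parquet scan are not supported on the GPU")
}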
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.nvidia.spark.rapids.shims.v2

import com.nvidia.spark.rapids.{DataFromReplacementRule, GpuCSVScan, RapidsConf, RapidsMeta, ScanMeta}

import org.apache.spark.sql.connector.read.{Scan, SupportsRuntimeFiltering}
import org.apache.spark.sql.execution.datasources.v2.csv.CSVScan

class RapidsCsvScanMeta(
    cScan: CSVScan,
    conf: RapidsConf,
    parent: Option[RapidsMeta[_, _, _]],
    rule: DataFromReplacementRule)
  extends ScanMeta[CSVScan](cScan, conf, parent, rule) {

  override def tagSelfForGpu(): Unit = {
    GpuCSVScan.tagSupport(this)
    // This is overly cautious: CSV is not expected to support runtime
    // filtering yet, but fall back if it ever does.
    if (cScan.isInstanceOf[SupportsRuntimeFiltering]) {
      willNotWorkOnGpu("Csv does not support Runtime filtering (DPP)" +
        " on datasource V2 yet.")
    }
  }

  override def convertToGpu(): Scan =
    GpuCSVScan(cScan.sparkSession,
      cScan.fileIndex,
      cScan.dataSchema,
      cScan.readDataSchema,
      cScan.readPartitionSchema,
      cScan.options,
      cScan.partitionFilters,
      cScan.dataFilters,
      conf.maxReadBatchSizeRows,
      conf.maxReadBatchSizeBytes)
}
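
The isInstanceOf check above guards against Spark's DSv2 runtime-filtering hook. For context, the interface it tests for was added in Spark 3.2 and has roughly this shape (a Scala sketch of what is actually a Java interface in org.apache.spark.sql.connector.read):

trait SupportsRuntimeFiltering extends Scan {
  // Columns the scan can filter at runtime, e.g. partition columns used by DPP.
  def filterAttributes(): Array[NamedReference]
  // Invoked by Spark with the runtime filters before the scan executes.
  def filter(filters: Array[Filter]): Unit
}

A scan implementing it may have data pruned after planning, so a GPU replacement that ignored the hook could read too much; falling back is the safe choice.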