Skip to content

Commit e0a442b

Browse files
author
Alexey Kudinkin
committed
Fixed reading of globbed paths to properly handle the case of partitioned tables not using Hive-style partitioning
1 parent 0caadf9 commit e0a442b

1 file changed

Lines changed: 17 additions & 1 deletion

File tree

hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyRelation.scala

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient
2424
import org.apache.hudi.hadoop.HoodieROTablePathFilter
2525
import org.apache.spark.sql.SQLContext
2626
import org.apache.spark.sql.catalyst.expressions.Expression
27+
import org.apache.spark.sql.execution.datasources
2728
import org.apache.spark.sql.execution.datasources._
2829
import org.apache.spark.sql.sources.{BaseRelation, Filter}
2930
import org.apache.spark.sql.types.StructType
@@ -148,6 +149,15 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
148149
val readPathsStr = optParams.get(DataSourceReadOptions.READ_PATHS.key)
149150
val extraReadPaths = readPathsStr.map(p => p.split(",").toSeq).getOrElse(Seq())
150151

152+
// NOTE: Spark is able to infer partitioning values from partition path only when Hive-style partitioning
153+
// scheme is used. Therefore, we fallback to reading the table as non-partitioned (specifying
154+
// partitionColumns = Seq.empty) whenever Hive-style partitioning is not involved
155+
val partitionColumns: Seq[String] = if (tableConfig.getHiveStylePartitioningEnable.toBoolean) {
156+
this.partitionColumns
157+
} else {
158+
Seq.empty
159+
}
160+
151161
DataSource.apply(
152162
sparkSession = sparkSession,
153163
paths = extraReadPaths,
@@ -162,9 +172,15 @@ class BaseFileOnlyRelation(sqlContext: SQLContext,
162172
//
163173
// We rely on [[HoodieROTablePathFilter]], to do proper filtering to assure that
164174
"mapreduce.input.pathFilter.class" -> classOf[HoodieROTablePathFilter].getName,
175+
165176
// We have to override [[EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH]] setting, since
166177
// the relation might have this setting overridden
167-
DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key -> shouldExtractPartitionValuesFromPartitionPath.toString
178+
DataSourceReadOptions.EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH.key -> shouldExtractPartitionValuesFromPartitionPath.toString,
179+
180+
// NOTE: We have to specify table's base-path explicitly, since we're requesting Spark to read it as a
181+
// list of globbed paths which complicates partitioning discovery for Spark.
182+
// Please check [[PartitioningAwareFileIndex#basePaths]] comment for more details.
183+
PartitioningAwareFileIndex.BASE_PATH_PARAM -> metaClient.getBasePathV2.toString
168184
),
169185
partitionColumns = partitionColumns
170186
)

0 commit comments

Comments
 (0)