[SPARK-18362][SQL] Use TextFileFormat in implementation of CSVFileFormat #15813
```diff
@@ -27,10 +27,11 @@ import org.apache.hadoop.mapreduce._

 import org.apache.spark.TaskContext
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.CompressionCodecs
 import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.datasources.text.TextFileFormat
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.SerializableConfiguration
```
```diff
@@ -56,13 +57,16 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {

     // TODO: Move filtering.
     val paths = files.filterNot(_.getPath.getName startsWith "_").map(_.getPath.toString)
-    val rdd = baseRdd(sparkSession, csvOptions, paths)
-    val firstLine = findFirstLine(csvOptions, rdd)
+    val lines: Dataset[String] = readText(sparkSession, csvOptions, paths)
+    val firstLine: String = findFirstLine(csvOptions, lines)
     val firstRow = new CsvReader(csvOptions).parseLine(firstLine)
     val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
     val header = makeSafeHeader(firstRow, csvOptions, caseSensitive)

-    val parsedRdd = tokenRdd(sparkSession, csvOptions, header, paths)
+    val parsedRdd: RDD[Array[String]] = CSVRelation.univocityTokenizer(
+      lines,
+      firstLine = if (csvOptions.headerFlag) firstLine else null,
+      params = csvOptions)
     val schema = if (csvOptions.inferSchemaFlag) {
       CSVInferSchema.infer(parsedRdd, header, csvOptions)
     } else {
```
```diff
@@ -173,35 +177,17 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
     }
   }

-  private def baseRdd(
-      sparkSession: SparkSession,
-      options: CSVOptions,
-      inputPaths: Seq[String]): RDD[String] = {
-    readText(sparkSession, options, inputPaths.mkString(","))
-  }
-
-  private def tokenRdd(
-      sparkSession: SparkSession,
-      options: CSVOptions,
-      header: Array[String],
-      inputPaths: Seq[String]): RDD[Array[String]] = {
-    val rdd = baseRdd(sparkSession, options, inputPaths)
-    // Make sure firstLine is materialized before sending to executors
-    val firstLine = if (options.headerFlag) findFirstLine(options, rdd) else null
-    CSVRelation.univocityTokenizer(rdd, firstLine, options)
-  }
-
   /**
    * Returns the first line of the first non-empty file in path
    */
-  private def findFirstLine(options: CSVOptions, rdd: RDD[String]): String = {
+  private def findFirstLine(options: CSVOptions, lines: Dataset[String]): String = {
     if (options.isCommentSet) {
       val comment = options.comment.toString
-      rdd.filter { line =>
+      lines.filter { line =>
         line.trim.nonEmpty && !line.startsWith(comment)
       }.first()
     } else {
-      rdd.filter { line =>
+      lines.filter { line =>
         line.trim.nonEmpty
       }.first()
     }
```
```diff
@@ -210,14 +196,21 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {

   private def readText(
       sparkSession: SparkSession,
       options: CSVOptions,
-      location: String): RDD[String] = {
+      inputPaths: Seq[String]): Dataset[String] = {
     if (Charset.forName(options.charset) == StandardCharsets.UTF_8) {
-      sparkSession.sparkContext.textFile(location)
+      sparkSession.baseRelationToDataFrame(
+        DataSource.apply(
+          sparkSession,
+          paths = inputPaths,
+          className = classOf[TextFileFormat].getName
+        ).resolveRelation(checkFilesExist = false))
+        .select("value").as[String](Encoders.STRING)
```
Member:
Hi @JoshRosen, I just happened to look at this one and I am just curious. IIUC, the schema from the […] So, my question is, is that […]

Contributor (Author):
I copied this logic from the […]
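For context, the new UTF-8 branch is roughly what going through the public reader API would give you. A minimal sketch of the equivalent call, assuming the same `sparkSession` and `inputPaths` (the PR resolves the relation directly, presumably so it can pass `checkFilesExist = false` and avoid re-validating paths that were already listed):

```scala
import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

// Hypothetical sketch, not code from this PR: read the input files as lines of text.
// spark.read.text yields a DataFrame with a single "value" column, one row per line,
// which is then narrowed to a Dataset[String].
def readUtf8Lines(sparkSession: SparkSession, inputPaths: Seq[String]): Dataset[String] = {
  sparkSession.read.text(inputPaths: _*).select("value").as[String](Encoders.STRING)
}
```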
```diff
     } else {
       val charset = options.charset
-      sparkSession.sparkContext
-        .hadoopFile[LongWritable, Text, TextInputFormat](location)
+      val rdd = sparkSession.sparkContext
```
Contributor:
@JoshRosen, do you know why the special handling for non-UTF-8 encoding is needed? I would think TextFileFormat itself already supports that, since it is reading it in from Hadoop Text.

Contributor (Author):
I'm not sure; I think this was a carryover from […]

Contributor:
cc @falaki
```diff
+        .hadoopFile[LongWritable, Text, TextInputFormat](inputPaths.mkString(","))
         .mapPartitions(_.map(pair => new String(pair._2.getBytes, 0, pair._2.getLength, charset)))
+      sparkSession.createDataset(rdd)(Encoders.STRING)
     }
   }
```
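On the encoding question above: Hadoop's `Text` stores each line's raw bytes and its `toString` always decodes them as UTF-8, which is presumably why the non-UTF-8 branch decodes the bytes itself with the requested charset. A minimal sketch of that decoding step (`decodeLine` is a hypothetical name, not a helper in this PR):

```scala
import java.nio.charset.Charset
import org.apache.hadoop.io.Text

// Illustrates the body of the mapPartitions call in the diff above.
def decodeLine(line: Text, charsetName: String): String = {
  // Text.getBytes may return a backing array longer than the actual content,
  // so getLength must be passed along with the charset.
  new String(line.getBytes, 0, line.getLength, Charset.forName(charsetName))
}
```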
Using untyped `filter` can be more performant here, since we don't need to pay for the extra de/serialization costs:
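A minimal sketch of what that suggestion could look like (hypothetical helper, not code from this PR): expressing the predicate with Column functions keeps the filter in the untyped expression API, so it can run against Catalyst's internal rows instead of deserializing every line to a java.lang.String first.

```scala
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions.{col, length, trim}

// Hypothetical untyped-filter variant of findFirstLine. `lines` is the Dataset[String]
// produced by readText, whose single column is named "value".
def firstNonEmptyLine(lines: Dataset[String], comment: Option[String]): String = {
  val nonEmpty = length(trim(col("value"))) > 0
  val predicate = comment match {
    case Some(c) => nonEmpty && !col("value").startsWith(c)
    case None    => nonEmpty
  }
  // Dataset.filter(Column) evaluates the predicate without deserializing rows;
  // filter(line => line.trim.nonEmpty) would materialize each row as a String
  // just to run the Scala closure.
  lines.filter(predicate).first()
}
```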