-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-25241][SQL] Configurable empty values when reading/writing CSV files #22234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
8b51800
ebd052b
17eaba6
3d3f178
bb28db9
0bcdb2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -345,11 +345,11 @@ def text(self, paths, wholetext=False, lineSep=None): | |
| @since(2.0) | ||
| def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, | ||
| comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, | ||
| ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, | ||
| negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, | ||
| maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, | ||
| columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, | ||
| samplingRatio=None, enforceSchema=None): | ||
| ignoreTrailingWhiteSpace=None, nullValue=None, emptyValue=None, nanValue=None, | ||
| positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, | ||
| maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, | ||
| mode=None, columnNameOfCorruptRecord=None, multiLine=None, | ||
| charToEscapeQuoteEscaping=None, samplingRatio=None, enforceSchema=None): | ||
| """Loads a CSV file and returns the result as a :class:`DataFrame`. | ||
|
|
||
| This function will go through the input once to determine the input schema if | ||
|
|
@@ -395,6 +395,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non | |
| :param nullValue: sets the string representation of a null value. If None is set, it uses | ||
| the default value, empty string. Since 2.0.1, this ``nullValue`` param | ||
| applies to all supported types including the string type. | ||
| :param emptyValue: sets the string representation of an empty value. If None is set, it uses | ||
| the default value, empty string. | ||
| :param nanValue: sets the string representation of a non-number value. If None is set, it | ||
| uses the default value, ``NaN``. | ||
| :param positiveInf: sets the string representation of a positive infinity value. If None | ||
|
|
@@ -457,9 +459,9 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non | |
| schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment, | ||
| header=header, inferSchema=inferSchema, ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, | ||
| ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, nullValue=nullValue, | ||
| nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf, | ||
| dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns, | ||
| maxCharsPerColumn=maxCharsPerColumn, | ||
| emptyValue=emptyValue, nanValue=nanValue, positiveInf=positiveInf, | ||
|
||
| negativeInf=negativeInf, dateFormat=dateFormat, timestampFormat=timestampFormat, | ||
| maxColumns=maxColumns, maxCharsPerColumn=maxCharsPerColumn, | ||
| maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, | ||
| columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, | ||
| charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio, | ||
|
|
@@ -857,9 +859,9 @@ def text(self, path, compression=None, lineSep=None): | |
|
|
||
| @since(2.0) | ||
| def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=None, | ||
| header=None, nullValue=None, escapeQuotes=None, quoteAll=None, dateFormat=None, | ||
| timestampFormat=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, | ||
| charToEscapeQuoteEscaping=None, encoding=None): | ||
| header=None, nullValue=None, emptyValue=None, escapeQuotes=None, quoteAll=None, | ||
| dateFormat=None, timestampFormat=None, ignoreLeadingWhiteSpace=None, | ||
| ignoreTrailingWhiteSpace=None, charToEscapeQuoteEscaping=None, encoding=None): | ||
| """Saves the content of the :class:`DataFrame` in CSV format at the specified path. | ||
|
|
||
| :param path: the path in any Hadoop supported file system | ||
|
|
@@ -891,6 +893,8 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No | |
| the default value, ``false``. | ||
| :param nullValue: sets the string representation of a null value. If None is set, it uses | ||
| the default value, empty string. | ||
| :param emptyValue: sets the string representation of an empty value. If None is set, it uses | ||
| the default value, ``""``. | ||
| :param dateFormat: sets the string that indicates a date format. Custom date formats | ||
| follow the formats at ``java.text.SimpleDateFormat``. This | ||
| applies to date type. If None is set, it uses the | ||
|
|
@@ -916,8 +920,8 @@ def csv(self, path, mode=None, compression=None, sep=None, quote=None, escape=No | |
| """ | ||
| self.mode(mode) | ||
| self._set_opts(compression=compression, sep=sep, quote=quote, escape=escape, header=header, | ||
| nullValue=nullValue, escapeQuotes=escapeQuotes, quoteAll=quoteAll, | ||
| dateFormat=dateFormat, timestampFormat=timestampFormat, | ||
| nullValue=nullValue, emptyValue=emptyValue, escapeQuotes=escapeQuotes, | ||
| quoteAll=quoteAll, dateFormat=dateFormat, timestampFormat=timestampFormat, | ||
| ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace, | ||
| ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, | ||
| charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -117,6 +117,9 @@ class CSVOptions( | |
|
|
||
| val nullValue = parameters.getOrElse("nullValue", "") | ||
|
|
||
| val emptyValueInRead = parameters.getOrElse("emptyValue", "") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would just call it
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought that as well. Just for the sake of providing backwards compatibility as we already have in
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had to give them different names because the default values are different. Ah, yeah, then it makes sense here. I read it too quickly. |
||
| val emptyValueInWrite = parameters.getOrElse("emptyValue", "\"\"") | ||
|
|
||
| val nanValue = parameters.getOrElse("nanValue", "NaN") | ||
|
|
||
| val positiveInf = parameters.getOrElse("positiveInf", "Inf") | ||
|
|
@@ -173,7 +176,7 @@ class CSVOptions( | |
| writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite) | ||
| writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite) | ||
| writerSettings.setNullValue(nullValue) | ||
| writerSettings.setEmptyValue("\"\"") | ||
| writerSettings.setEmptyValue(emptyValueInWrite) | ||
| writerSettings.setSkipEmptyLines(true) | ||
| writerSettings.setQuoteAllFields(quoteAll) | ||
| writerSettings.setQuoteEscapingEnabled(escapeQuotes) | ||
|
|
@@ -194,7 +197,7 @@ class CSVOptions( | |
| settings.setInputBufferSize(inputBufferSize) | ||
| settings.setMaxColumns(maxColumns) | ||
| settings.setNullValue(nullValue) | ||
| settings.setEmptyValue("") | ||
| settings.setEmptyValue(emptyValueInRead) | ||
| settings.setMaxCharsPerColumn(maxCharsPerColumn) | ||
| settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) | ||
| settings | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| year,make,model,comment,blank | ||
| "2012","Tesla","S","","" | ||
| 1997,Ford,E350,"Go get one now they are going fast", | ||
| 2015,Chevy,Volt,,"" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should be put last; otherwise, it's going to break existing Python apps when the arguments are given positionally.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should add the new parameter at the end. +1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done!