-
Notifications
You must be signed in to change notification settings - Fork 440
Fix schema/type inference issue #216 #244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
a275120
8b4dbdd
b150c55
9be0313
ff96172
d36b4a8
c20b852
18957e2
d41d5fb
9a1f428
80fbac4
bb510a6
2c24965
d6424ef
890fadd
52cd74f
6f90a6b
3d07d82
718b467
5ef29b8
6771184
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| A,B,C,D | ||
| 1,,, | ||
| ,1,, | ||
| ,,1, | ||
| ,,,1 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,8 +2,31 @@ package com.databricks.spark.csv.util | |
|
|
||
| import org.apache.spark.sql.types._ | ||
| import org.scalatest.FunSuite | ||
| import org.scalatest.BeforeAndAfterAll | ||
| import org.apache.spark.SparkContext | ||
| import org.apache.spark.sql.SQLContext | ||
| import com.databricks.spark.csv.CsvParser | ||
| import com.databricks.spark.csv.CsvRelation | ||
|
|
||
| class InferSchemaSuite extends FunSuite { | ||
| class InferSchemaSuite extends FunSuite with BeforeAndAfterAll { | ||
|
|
||
| private val simpleDatasetFile = "src/test/resources/simple.csv" | ||
| private val utf8Charset = "utf-8" | ||
| private var sqlContext: SQLContext = _ | ||
|
|
||
| override def beforeAll(): Unit = | ||
|
||
| { | ||
| super.beforeAll() | ||
| sqlContext = new SQLContext(new SparkContext("local[2]", "InferSchemaSuite")) | ||
| } | ||
|
|
||
| override def afterAll(): Unit = { | ||
| try { | ||
| sqlContext.sparkContext.stop() | ||
| } finally { | ||
| super.afterAll() | ||
| } | ||
| } | ||
|
|
||
| test("String fields types are inferred correctly from null types") { | ||
| assert(InferSchema.inferField(NullType, "") == NullType) | ||
|
|
@@ -40,6 +63,14 @@ class InferSchemaSuite extends FunSuite { | |
| assert(InferSchema.inferField(LongType, "2015-08 14:49:00") == StringType) | ||
| } | ||
|
|
||
| test("Merging Nulltypes should yeild Nulltype.") | ||
| { | ||
|
||
| assert( | ||
|
||
| InferSchema.mergeRowTypes(Array(NullType), | ||
| Array(NullType)).deep == Array(NullType).deep) | ||
|
|
||
| } | ||
|
|
||
| test("Type arrays are merged to highest common type") { | ||
| assert( | ||
| InferSchema.mergeRowTypes(Array(StringType), | ||
|
|
@@ -52,4 +83,14 @@ class InferSchemaSuite extends FunSuite { | |
| Array(LongType)).deep == Array(DoubleType).deep) | ||
| } | ||
|
|
||
| test("Type/Schema inference works as expected for the simple parse dataset.") | ||
|
||
| { | ||
| val df = new CsvParser().withUseHeader(true).withInferSchema(true) | ||
| .csvFile(sqlContext, simpleDatasetFile) | ||
| assert( | ||
| df.schema.fields.map{field => field.dataType}.deep == | ||
|
||
| Array(IntegerType, IntegerType, IntegerType, IntegerType).deep | ||
| ) | ||
|
|
||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just my personal thought. Could we maybe do this refactoring in a separate PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, that would make more sense.