Skip to content

Commit 72beea6

Browse files
[SPARK-10186] [SQL] Add support for array types using JDBCRDD and postgres
This change allows reading JDBC array column types with the PostgreSQL dialect. It also lays the groundwork for supporting array types in other JDBC backends.
1 parent d45a0d3 commit 72beea6

3 files changed

Lines changed: 91 additions & 41 deletions

File tree

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import org.apache.spark.rdd.RDD
2626
import org.apache.spark.sql.catalyst.InternalRow
2727
import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow
2828
import org.apache.spark.sql.catalyst.util.DateTimeUtils
29+
import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLTimestamp
2930
import org.apache.spark.sql.jdbc.JdbcDialects
3031
import org.apache.spark.sql.sources._
3132
import org.apache.spark.sql.types._
@@ -324,12 +325,13 @@ private[sql] class JDBCRDD(
324325
case object StringConversion extends JDBCConversion
325326
case object TimestampConversion extends JDBCConversion
326327
case object BinaryConversion extends JDBCConversion
328+
case class ArrayConversion(elementConversion: JDBCConversion) extends JDBCConversion
327329

328330
/**
329-
* Maps a StructType to a type tag list.
331+
* Maps a StructField and its associated DataType to a type tag.
330332
*/
331-
def getConversions(schema: StructType): Array[JDBCConversion] = {
332-
schema.fields.map(sf => sf.dataType match {
333+
def getConversion(sf: StructField, dataType: DataType): JDBCConversion = {
334+
dataType match {
333335
case BooleanType => BooleanConversion
334336
case DateType => DateConversion
335337
case DecimalType.Fixed(p, s) => DecimalConversion(p, s)
@@ -341,8 +343,16 @@ private[sql] class JDBCRDD(
341343
case StringType => StringConversion
342344
case TimestampType => TimestampConversion
343345
case BinaryType => BinaryConversion
346+
case ArrayType(d, x) => ArrayConversion(getConversion(sf, d))
344347
case _ => throw new IllegalArgumentException(s"Unsupported field $sf")
345-
}).toArray
348+
}
349+
}
350+
351+
/**
352+
* Maps a StructType to a type tag list.
353+
*/
354+
def getConversions(schema: StructType): Array[JDBCConversion] = {
355+
schema.fields.map(sf => getConversion(sf, sf.dataType))
346356
}
347357

348358
/**
@@ -375,6 +385,10 @@ private[sql] class JDBCRDD(
375385
val conversions = getConversions(schema)
376386
val mutableRow = new SpecificMutableRow(schema.fields.map(x => x.dataType))
377387

388+
def convert_date(dateVal: java.sql.Date): Int = DateTimeUtils.fromJavaDate(dateVal)
389+
def convert_decimal(decimal: java.math.BigDecimal, p: Int, s: Int): Decimal = Decimal(decimal, p, s)
390+
def convert_timestamp(ts: java.sql.Timestamp): SQLTimestamp = DateTimeUtils.fromJavaTimestamp(ts)
391+
378392
def getNext(): InternalRow = {
379393
if (rs.next()) {
380394
var i = 0
@@ -386,7 +400,7 @@ private[sql] class JDBCRDD(
386400
// DateTimeUtils.fromJavaDate does not handle null value, so we need to check it.
387401
val dateVal = rs.getDate(pos)
388402
if (dateVal != null) {
389-
mutableRow.setInt(i, DateTimeUtils.fromJavaDate(dateVal))
403+
mutableRow.setInt(i, convert_date(dateVal))
390404
} else {
391405
mutableRow.update(i, null)
392406
}
@@ -403,7 +417,7 @@ private[sql] class JDBCRDD(
403417
if (decimalVal == null) {
404418
mutableRow.update(i, null)
405419
} else {
406-
mutableRow.update(i, Decimal(decimalVal, p, s))
420+
mutableRow.update(i, convert_decimal(decimalVal, p, s))
407421
}
408422
case DoubleConversion => mutableRow.setDouble(i, rs.getDouble(pos))
409423
case FloatConversion => mutableRow.setFloat(i, rs.getFloat(pos))
@@ -414,21 +428,42 @@ private[sql] class JDBCRDD(
414428
case TimestampConversion =>
415429
val t = rs.getTimestamp(pos)
416430
if (t != null) {
417-
mutableRow.setLong(i, DateTimeUtils.fromJavaTimestamp(t))
431+
mutableRow.setLong(i, convert_timestamp(t))
418432
} else {
419433
mutableRow.update(i, null)
420434
}
421435
case BinaryConversion => mutableRow.update(i, rs.getBytes(pos))
422-
case BinaryLongConversion => {
436+
case BinaryLongConversion =>
423437
val bytes = rs.getBytes(pos)
424438
var ans = 0L
425439
var j = 0
426440
while (j < bytes.size) {
427441
ans = 256 * ans + (255 & bytes(j))
428-
j = j + 1;
442+
j = j + 1
429443
}
430444
mutableRow.setLong(i, ans)
431-
}
445+
446+
case ArrayConversion(BinaryLongConversion) => throw new IllegalArgumentException(s"Unsupported array element conversion $i")
447+
case ArrayConversion(subConvert) =>
448+
val a = rs.getArray(pos)
449+
if (a != null) {
450+
val genericArrayData = a.getArray match {
451+
case x: Array[java.math.BigDecimal] =>
452+
subConvert match {
453+
case DecimalConversion(p, s) => new GenericArrayData(x.map(convert_decimal(_, p, s)))
454+
case _ => throw new IllegalArgumentException("Incompatible decimal conversions")
455+
}
456+
case x: Array[java.sql.Timestamp] => new GenericArrayData(x.map(convert_timestamp))
457+
case x: Array[java.lang.String] => new GenericArrayData(x.map(UTF8String.fromString))
458+
case x: Array[java.sql.Date] => new GenericArrayData(x.map(convert_date))
459+
case x: Array[Any] => new GenericArrayData(x)
460+
case _ => throw new IllegalArgumentException(s"Unsupported arraytype $a")
461+
}
462+
mutableRow.update(i, genericArrayData)
463+
} else {
464+
mutableRow.update(i, null)
465+
}
466+
432467
}
433468
if (rs.wasNull) mutableRow.setNullAt(i)
434469
i = i + 1

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -171,21 +171,9 @@ object JdbcUtils extends Logging {
171171
val name = field.name
172172
val typ: String =
173173
dialect.getJDBCType(field.dataType).map(_.databaseTypeDefinition).getOrElse(
174-
field.dataType match {
175-
case IntegerType => "INTEGER"
176-
case LongType => "BIGINT"
177-
case DoubleType => "DOUBLE PRECISION"
178-
case FloatType => "REAL"
179-
case ShortType => "INTEGER"
180-
case ByteType => "BYTE"
181-
case BooleanType => "BIT(1)"
182-
case StringType => "TEXT"
183-
case BinaryType => "BLOB"
184-
case TimestampType => "TIMESTAMP"
185-
case DateType => "DATE"
186-
case t: DecimalType => s"DECIMAL(${t.precision},${t.scale})"
187-
case _ => throw new IllegalArgumentException(s"Don't know how to save $field to JDBC")
188-
})
174+
dialect.getCommonJDBCType(field.dataType).map(_.databaseTypeDefinition).getOrElse(
175+
throw new IllegalArgumentException(s"Don't know how to save $field to JDBC")
176+
))
189177
val nullable = if (field.nullable) "" else "NOT NULL"
190178
sb.append(s", $name $typ $nullable")
191179
}}
@@ -203,23 +191,11 @@ object JdbcUtils extends Logging {
203191
val dialect = JdbcDialects.get(url)
204192
val nullTypes: Array[Int] = df.schema.fields.map { field =>
205193
dialect.getJDBCType(field.dataType).map(_.jdbcNullType).getOrElse(
206-
field.dataType match {
207-
case IntegerType => java.sql.Types.INTEGER
208-
case LongType => java.sql.Types.BIGINT
209-
case DoubleType => java.sql.Types.DOUBLE
210-
case FloatType => java.sql.Types.REAL
211-
case ShortType => java.sql.Types.INTEGER
212-
case ByteType => java.sql.Types.INTEGER
213-
case BooleanType => java.sql.Types.BIT
214-
case StringType => java.sql.Types.CLOB
215-
case BinaryType => java.sql.Types.BLOB
216-
case TimestampType => java.sql.Types.TIMESTAMP
217-
case DateType => java.sql.Types.DATE
218-
case t: DecimalType => java.sql.Types.DECIMAL
219-
case _ => throw new IllegalArgumentException(
194+
dialect.getCommonJDBCType(field.dataType).map(_.jdbcNullType).getOrElse(
195+
throw new IllegalArgumentException(
220196
s"Can't translate null value for field $field")
221-
})
222-
}
197+
))
198+
}
223199

224200
val rddSchema = df.schema
225201
val driver: String = DriverRegistry.getDriverClassName(url)

sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,24 @@ abstract class JdbcDialect {
8181
*/
8282
def getJDBCType(dt: DataType): Option[JdbcType] = None
8383

84+
def getCommonJDBCType(dataType: DataType): Option[JdbcType] = {
85+
dataType match {
86+
case IntegerType => Some(JdbcType("INTEGER", java.sql.Types.INTEGER))
87+
case LongType => Some(JdbcType("BIGINT", java.sql.Types.BIGINT))
88+
case DoubleType => Some(JdbcType("DOUBLE PRECISION", java.sql.Types.DOUBLE))
89+
case FloatType => Some(JdbcType("REAL", java.sql.Types.FLOAT))
90+
case ShortType => Some(JdbcType("INTEGER", java.sql.Types.SMALLINT))
91+
case ByteType => Some(JdbcType("BYTE", java.sql.Types.TINYINT))
92+
case BooleanType => Some(JdbcType("BIT(1)", java.sql.Types.BIT))
93+
case StringType => Some(JdbcType("TEXT", java.sql.Types.CLOB))
94+
case BinaryType => Some(JdbcType("BLOB", java.sql.Types.BLOB))
95+
case TimestampType => Some(JdbcType("TIMESTAMP", java.sql.Types.TIMESTAMP))
96+
case DateType => Some(JdbcType("DATE", java.sql.Types.DATE))
97+
case DecimalType(p, s) => Some(JdbcType(s"DECIMAL($p,$s)", java.sql.Types.DECIMAL))
98+
case _ => None
99+
}
100+
}
101+
84102
/**
85103
* Quotes the identifier. This is used to put quotes around the identifier in case the column
86104
* name is a reserved keyword, or in case it contains characters that require quotes (e.g. space).
@@ -207,13 +225,34 @@ case object PostgresDialect extends JdbcDialect {
207225
Some(StringType)
208226
} else if (sqlType == Types.OTHER && typeName.equals("jsonb")) {
209227
Some(StringType)
228+
} else if (sqlType == Types.ARRAY) {
229+
typeName match {
230+
case "_bit" => Some(ArrayType(BinaryType))
231+
case "_int1" => Some(ArrayType(ByteType))
232+
case "_int2" => Some(ArrayType(ShortType))
233+
case "_int4" => Some(ArrayType(IntegerType))
234+
case "_int8" => Some(ArrayType(LongType))
235+
case "_float4" => Some(ArrayType(FloatType))
236+
case "_float8" => Some(ArrayType(DoubleType))
237+
case "_text" | "_char" | "_varchar" => Some(ArrayType(StringType))
238+
case "_timestamp" | "timestamptz" => Some(ArrayType(TimestampType))
239+
case "_date" => Some(ArrayType(DateType))
240+
case _ => throw new IllegalArgumentException(s"Unhandled postgres array type $typeName")
241+
}
210242
} else None
211243
}
212244

213245
override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
214246
case StringType => Some(JdbcType("TEXT", java.sql.Types.CHAR))
215247
case BinaryType => Some(JdbcType("BYTEA", java.sql.Types.BINARY))
216248
case BooleanType => Some(JdbcType("BOOLEAN", java.sql.Types.BOOLEAN))
249+
case ArrayType(t) =>
250+
val subtype = getJDBCType(t).map(_.databaseTypeDefinition).getOrElse(
251+
getCommonJDBCType(t).map(_.databaseTypeDefinition).getOrElse(
252+
throw new IllegalArgumentException(s"Unexpected JDBC array subtype $t")
253+
)
254+
)
255+
Some(JdbcType(s"$subtype[]", java.sql.Types.ARRAY))
217256
case _ => None
218257
}
219258

0 commit comments

Comments (0)