
Commit 2476ed0

Minor updates.

1 parent ab71f21

16 files changed: 115 additions & 38 deletions

python/pyspark/sql.py

Lines changed: 28 additions & 8 deletions
@@ -21,9 +21,9 @@
 from py4j.protocol import Py4JError

 __all__ = [
-    "StringType", "BinaryType", "BooleanType", "DecimalType", "DoubleType",
-    "FloatType", "ByteType", "IntegerType", "LongType", "ShortType",
-    "ArrayType", "MapType", "StructField", "StructType",
+    "StringType", "BinaryType", "BooleanType", "TimestampType", "DecimalType",
+    "DoubleType", "FloatType", "ByteType", "IntegerType", "LongType",
+    "ShortType", "ArrayType", "MapType", "StructField", "StructType",
     "SQLContext", "HiveContext", "LocalHiveContext", "TestHiveContext", "SchemaRDD", "Row"]

 class PrimitiveTypeSingleton(type):
@@ -106,7 +106,7 @@ class FloatType(object):
     Because query evaluation is done in Scala, java.lang.Double will be used
     for Python float numbers. Because the underlying JVM type of FloatType is
     java.lang.Float (in Java) and Float (in Scala), there will be a java.lang.ClassCastException
-    if FloatType (Python) used.
+    if FloatType (Python) is used.

     """
     __metaclass__ = PrimitiveTypeSingleton
@@ -121,7 +121,7 @@ class ByteType(object):
     Because query evaluation is done in Scala, java.lang.Integer will be used
     for Python int numbers. Because the underlying JVM type of ByteType is
     java.lang.Byte (in Java) and Byte (in Scala), there will be a java.lang.ClassCastException
-    if ByteType (Python) used.
+    if ByteType (Python) is used.

     """
     __metaclass__ = PrimitiveTypeSingleton
@@ -159,7 +159,7 @@ class ShortType(object):
     Because query evaluation is done in Scala, java.lang.Integer will be used
     for Python int numbers. Because the underlying JVM type of ShortType is
     java.lang.Short (in Java) and Short (in Scala), there will be a java.lang.ClassCastException
-    if ShortType (Python) used.
+    if ShortType (Python) is used.

     """
     __metaclass__ = PrimitiveTypeSingleton
@@ -171,13 +171,16 @@ class ArrayType(object):
     """Spark SQL ArrayType

     The data type representing list values.
+    An ArrayType object comprises two fields, elementType (a DataType) and containsNull (a bool).
+    The field of elementType is used to specify the type of array elements.
+    The field of containsNull is used to specify if the array has None values.

     """
     def __init__(self, elementType, containsNull=False):
         """Creates an ArrayType

         :param elementType: the data type of elements.
-        :param containsNull: indicates whether the list contains null values.
+        :param containsNull: indicates whether the list contains None values.
         :return:

         >>> ArrayType(StringType) == ArrayType(StringType, False)
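
For context, a minimal usage sketch of the two fields documented above (not part of the diff; it mirrors the doctest's convention of passing the singleton type classes directly):

    from pyspark.sql import ArrayType, StringType

    # An array of strings whose elements may be None.
    string_array = ArrayType(StringType, containsNull=True)
    assert string_array != ArrayType(StringType)  # containsNull defaults to False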
@@ -205,6 +208,12 @@ class MapType(object):
     """Spark SQL MapType

     The data type representing dict values.
+    A MapType object comprises three fields,
+    keyType (a DataType), valueType (a DataType) and valueContainsNull (a bool).
+    The field of keyType is used to specify the type of keys in the map.
+    The field of valueType is used to specify the type of values in the map.
+    The field of valueContainsNull is used to specify if values of this map have None values.
+    For values of a MapType column, keys are not allowed to have None values.

     """
     def __init__(self, keyType, valueType, valueContainsNull=True):
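
A similar hedged sketch for MapType (not part of the diff): keys are typed by keyType, values by valueType, and valueContainsNull defaults to True:

    from pyspark.sql import MapType, StringType, IntegerType

    # A dict from strings to ints; values may be None (the default), keys may not.
    word_counts = MapType(StringType, IntegerType)
    strict_counts = MapType(StringType, IntegerType, valueContainsNull=False)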
@@ -241,6 +250,10 @@ class StructField(object):
     """Spark SQL StructField

     Represents a field in a StructType.
+    A StructField object comprises three fields, name (a string), dataType (a DataType),
+    and nullable (a bool). The field of name is the name of a StructField. The field of
+    dataType specifies the data type of a StructField.
+    The field of nullable specifies if values of a StructField can contain None values.

     """
     def __init__(self, name, dataType, nullable):
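
A hedged one-liner for StructField (not part of the diff): the constructor arguments map directly onto the three documented fields:

    from pyspark.sql import StructField, IntegerType

    # An integer field named "age" whose values may be None.
    age_field = StructField("age", IntegerType, True)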
@@ -276,7 +289,8 @@ def __ne__(self, other):
 class StructType(object):
     """Spark SQL StructType

-    The data type representing tuple values.
+    The data type representing namedtuple values.
+    A StructType object comprises a list of L{StructField}s.

     """
     def __init__(self, fields):
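
Putting the two together, a hedged sketch of a small schema (not part of the diff):

    from pyspark.sql import StructType, StructField, StringType, IntegerType

    person = StructType([
        StructField("name", StringType, False),  # names must not be None
        StructField("age", IntegerType, True)])  # ages may be None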
@@ -308,6 +322,11 @@ def __ne__(self, other):
         return not self.__eq__(other)

 def _parse_datatype_list(datatype_list_string):
+    """Parses a list of comma separated data types.
+
+    :param datatype_list_string:
+    :return:
+    """
     index = 0
     datatype_list = []
     start = 0
@@ -331,6 +350,7 @@ def _parse_datatype_list(datatype_list_string):

 def _parse_datatype_string(datatype_string):
     """Parses the given data type string.
+
     :param datatype_string:
     :return:
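
These two helpers are internal: they rebuild a Python DataType from the string form produced on the Scala side. A hedged sketch of the intended round trip; the input format is an assumption based on Scala case-class toString output, not something this diff specifies:

    # Hypothetical input; the real string would come from the JVM's DataType.toString.
    dt = _parse_datatype_string("ArrayType(StringType,false)")
    assert dt == ArrayType(StringType, False)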

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 2 additions & 2 deletions
@@ -109,12 +109,12 @@ class Analyzer(catalog: Catalog, registry: FunctionRegistry, caseSensitive: Bool
   object ResolveReferences extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
       case q: LogicalPlan if q.childrenResolved =>
-        logTrace(s"Attempting to resolve ${q.simpleString}")
+        logger.trace(s"Attempting to resolve ${q.simpleString}")
         q transformExpressions {
           case u @ UnresolvedAttribute(name) =>
             // Leave unchanged if resolution fails. Hopefully will be resolved next round.
             val result = q.resolve(name).getOrElse(u)
-            logDebug(s"Resolving $u to $result")
+            logger.debug(s"Resolving $u to $result")
             result
         }
     }

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala

Lines changed: 4 additions & 4 deletions
@@ -75,7 +75,7 @@ trait HiveTypeCoercion {
           // Leave the same if the dataTypes match.
           case Some(newType) if a.dataType == newType.dataType => a
           case Some(newType) =>
-            logDebug(s"Promoting $a to $newType in ${q.simpleString}}")
+            logger.debug(s"Promoting $a to $newType in ${q.simpleString}}")
             newType
         }
       }
@@ -154,7 +154,7 @@ trait HiveTypeCoercion {
           (Alias(Cast(l, StringType), l.name)(), r)

         case (l, r) if l.dataType != r.dataType =>
-          logDebug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}")
+          logger.debug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}")
           findTightestCommonType(l.dataType, r.dataType).map { widestType =>
             val newLeft =
               if (l.dataType == widestType) l else Alias(Cast(l, widestType), l.name)()
@@ -170,15 +170,15 @@ trait HiveTypeCoercion {

       val newLeft =
         if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) {
-          logDebug(s"Widening numeric types in union $castedLeft ${left.output}")
+          logger.debug(s"Widening numeric types in union $castedLeft ${left.output}")
           Project(castedLeft, left)
         } else {
           left
         }

       val newRight =
         if (castedRight.map(_.dataType) != right.output.map(_.dataType)) {
-          logDebug(s"Widening numeric types in union $castedRight ${right.output}")
+          logger.debug(s"Widening numeric types in union $castedRight ${right.output}")
           Project(castedRight, right)
         } else {
           right

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.expressions

-import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.Logging
 import org.apache.spark.sql.catalyst.trees
 import org.apache.spark.sql.catalyst.errors.attachTree
 import org.apache.spark.sql.catalyst.plans.QueryPlan
@@ -79,7 +79,7 @@ object BindReferences extends Logging {
         // produce new attributes that can't be bound. Likely the right thing to do is remove
         // this rule and require all operators to explicitly bind to the input schema that
         // they specify.
-        logDebug(s"Couldn't find $a in ${input.mkString("[", ",", "]")}")
+        logger.debug(s"Couldn't find $a in ${input.mkString("[", ",", "]")}")
         a
       } else {
         BoundReference(ordinal, a)
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/package.scala

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+package object catalyst {
+  protected[catalyst] type Logging = com.typesafe.scalalogging.slf4j.Logging
+}
+

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.planning

-import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.Logging
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.trees.TreeNode
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala

Lines changed: 3 additions & 3 deletions
@@ -19,8 +19,8 @@ package org.apache.spark.sql.catalyst.planning

 import scala.annotation.tailrec

-import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.Logging
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._

@@ -113,7 +113,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper {

   def unapply(plan: LogicalPlan): Option[ReturnType] = plan match {
     case join @ Join(left, right, joinType, condition) =>
-      logDebug(s"Considering join on: $condition")
+      logger.debug(s"Considering join on: $condition")
       // Find equi-join predicates that can be evaluated before the join, and thus can be used
       // as join keys.
       val (joinPredicates, otherPredicates) =
@@ -131,7 +131,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper {
       val rightKeys = joinKeys.map(_._2)

       if (joinKeys.nonEmpty) {
-        logDebug(s"leftKeys:${leftKeys} | rightKeys:${rightKeys}")
+        logger.debug(s"leftKeys:${leftKeys} | rightKeys:${rightKeys}")
         Some((joinType, leftKeys, rightKeys, otherPredicates.reduceOption(And), left, right))
       } else {
         None

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.rules

-import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.Logging
 import org.apache.spark.sql.catalyst.trees.TreeNode

 abstract class Rule[TreeType <: TreeNode[_]] extends Logging {

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala

Lines changed: 6 additions & 6 deletions
@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.rules

-import org.apache.spark.Logging
+import org.apache.spark.sql.catalyst.Logging
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.catalyst.util.sideBySide

@@ -60,7 +60,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging {
         case (plan, rule) =>
           val result = rule(plan)
           if (!result.fastEquals(plan)) {
-            logTrace(
+            logger.trace(
               s"""
                 |=== Applying Rule ${rule.ruleName} ===
                 |${sideBySide(plan.treeString, result.treeString).mkString("\n")}
@@ -71,25 +71,25 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging {
         }
         iteration += 1
         if (iteration > batch.strategy.maxIterations) {
-          logInfo(s"Max iterations ($iteration) reached for batch ${batch.name}")
+          logger.info(s"Max iterations ($iteration) reached for batch ${batch.name}")
           continue = false
         }

         if (curPlan.fastEquals(lastPlan)) {
-          logTrace(s"Fixed point reached for batch ${batch.name} after $iteration iterations.")
+          logger.trace(s"Fixed point reached for batch ${batch.name} after $iteration iterations.")
           continue = false
         }
         lastPlan = curPlan
       }

       if (!batchStartPlan.fastEquals(curPlan)) {
-        logDebug(
+        logger.debug(
           s"""
             |=== Result of Batch ${batch.name} ===
             |${sideBySide(plan.treeString, curPlan.treeString).mkString("\n")}
           """.stripMargin)
       } else {
-        logTrace(s"Batch ${batch.name} has no effect.")
+        logger.trace(s"Batch ${batch.name} has no effect.")
       }
     }

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala

Lines changed: 2 additions & 1 deletion
@@ -359,7 +359,8 @@ case class MapType(
     valueContainsNull: Boolean) extends DataType {
   private[sql] def buildFormattedString(prefix: String, builder: StringBuilder): Unit = {
     builder.append(s"${prefix}-- key: ${keyType.simpleString}\n")
-    builder.append(s"${prefix}-- value: ${valueType.simpleString}\n")
+    builder.append(s"${prefix}-- value: ${valueType.simpleString} " +
+      s"(valueContainsNull = ${valueContainsNull})\n")
     DataType.buildFormattedString(keyType, s"$prefix |", builder)
     DataType.buildFormattedString(valueType, s"$prefix |", builder)
   }
