arjunshroff
diff --git a/‎examples/src/main/python/streaming/direct_kafka_wordcount.py‎
Lines changed: 25 additions & 9 deletions b/‎examples/src/main/python/streaming/direct_kafka_wordcount.py‎
Lines changed: 25 additions & 9 deletions
diff --git a/‎…/streaming/V10DirectKafkaWordCount.scala‎ ‎…/streaming/V09DirectKafkaWordCount.scala‎examples/src/main/scala/org/apache/spark/examples/streaming/V10DirectKafkaWordCount.scala renamed to examples/src/main/scala/org/apache/spark/examples/streaming/V09DirectKafkaWordCount.scala
Lines changed: 2 additions & 2 deletions b/‎…/streaming/V10DirectKafkaWordCount.scala‎ ‎…/streaming/V09DirectKafkaWordCount.scala‎examples/src/main/scala/org/apache/spark/examples/streaming/V10DirectKafkaWordCount.scala renamed to examples/src/main/scala/org/apache/spark/examples/streaming/V09DirectKafkaWordCount.scala
Lines changed: 2 additions & 2 deletions
diff --git a/‎external/kafka-0-9/src/main/scala/org/apache/spark/streaming/kafka09/DirectKafkaInputDStream.scala‎
Lines changed: 1 addition & 2 deletions b/‎external/kafka-0-9/src/main/scala/org/apache/spark/streaming/kafka09/DirectKafkaInputDStream.scala‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎external/kafka-0-9/src/main/scala/org/apache/spark/streaming/kafka09/KafkaUtils.scala‎
Lines changed: 124 additions & 48 deletions b/‎external/kafka-0-9/src/main/scala/org/apache/spark/streaming/kafka09/KafkaUtils.scala‎
Lines changed: 124 additions & 48 deletions
diff --git a/‎external/kafka-0-9/src/main/scala/org/apache/spark/streaming/kafka09/LocationStrategy.scala‎
Lines changed: 1 addition & 1 deletion b/‎external/kafka-0-9/src/main/scala/org/apache/spark/streaming/kafka09/LocationStrategy.scala‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/pyspark/streaming/kafka.py‎ ‎python/pyspark/streaming/kafka08.py‎python/pyspark/streaming/kafka.py renamed to python/pyspark/streaming/kafka08.py b/‎python/pyspark/streaming/kafka.py‎ ‎python/pyspark/streaming/kafka08.py‎python/pyspark/streaming/kafka.py renamed to python/pyspark/streaming/kafka08.py
@@ -31,25 +31,41 @@
 from __future__ import print_function
 
 import sys
-
 from pyspark import SparkContext
 from pyspark.streaming import StreamingContext
-from pyspark.streaming.kafka import KafkaUtils
+from pyspark.streaming.kafka09 import KafkaUtils
+from pyspark.streaming.kafka09 import ConsumerStrategies
+from pyspark.streaming.kafka09 import LocationStrategies
 
 if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr)
-        sys.exit(-1)
+    # Exactly six arguments are required: they are unpacked positionally below.
+    if len(sys.argv) != 7:
+        print("Usage: direct_kafka_wordcount.py <broker_list> <topic> <group_id> " +
+              "<offset_reset> <batch_interval> <poll_timeout>", file=sys.stderr)
+        sys.exit(-1)
+
+    brokers, topic, group_id, offset_reset, batch_interval, poll_timeout = sys.argv[1:]
 
     sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
-    ssc = StreamingContext(sc, 2)
+    ssc = StreamingContext(sc, int(batch_interval))
+    # Consumer settings: keys/values use the byte-array deserializer, auto-commit is off.
+    kafkaParams = {
+        "bootstrap.servers": brokers,
+        "group.id": group_id,
+        "key.deserializer": "org.apache.kafka.common.serialization.ByteArrayDeserializer",
+        "value.deserializer": "org.apache.kafka.common.serialization.ByteArrayDeserializer",
+        "auto.offset.reset": offset_reset,
+        "enable.auto.commit": "false",
+        "spark.kafka.poll.time": poll_timeout
+    }
 
-    brokers, topic = sys.argv[1:]
-    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
+    consumerStrategy = ConsumerStrategies.Subscribe(sc, [topic], kafkaParams)
+    locationStrategy = LocationStrategies.PreferConsistent(sc)
+    kvs = KafkaUtils.createDirectStream(ssc, locationStrategy, consumerStrategy)
     lines = kvs.map(lambda x: x[1])
     counts = lines.flatMap(lambda line: line.split(" ")) \
         .map(lambda word: (word, 1)) \
-        .reduceByKey(lambda a, b: a+b)
+        .reduceByKey(lambda a, b: a + b)
     counts.pprint()
 
     ssc.start()
 
 
@@ -42,7 +42,7 @@ import org.apache.spark.streaming.kafka09.{ConsumerStrategies, KafkaUtils, Locat
  *    topic1,topic2 my-consumer-group latest batch-interval pollTimeout
  */
 
-object V10DirectKafkaWordCount {
+object V09DirectKafkaWordCount {
   def main(args: Array[String]) {
     if (args.length < 4) {
       System.err.println(s"""
@@ -90,7 +90,7 @@ object V10DirectKafkaWordCount {
 
     val consumerStrategy = ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams)
     val messages = KafkaUtils.createDirectStream[String, String](
-      ssc, LocationStrategies.PreferConsistent, consumerStrategy)
+      ssc, LocationStrategies.PreferConsistent(), consumerStrategy)
 
     // Get the lines, split them into words, count the words and print
     val lines = messages.map(_.value())
 
@@ -44,7 +44,6 @@ import org.apache.spark.streaming.scheduler.rate.RateEstimator
  * per second that each '''partition''' will accept.
  * @param locationStrategy In most cases, pass in [[PreferConsistent]],
  *   see [[LocationStrategy]] for more details.
- * @param executorKafkaParams Kafka
  * <a href="http://kafka.apache.org/documentation.html#newconsumerconfigs">
  * configuration parameters</a>.
  *   Requires  "bootstrap.servers" to be set with Kafka broker(s),
@@ -110,7 +109,7 @@ private[spark] class DirectKafkaInputDStream[K, V](
   }
 
   // Keep this consistent with how other streams are named (e.g. "Flume polling stream [2]")
-  private[streaming] override def name: String = s"Kafka 0.10 direct stream [$id]"
+  private[streaming] override def name: String = s"Kafka 0.9 direct stream [$id]"
 
   protected[streaming] override val checkpointData =
     new DirectKafkaInputDStreamCheckpointData
 
@@ -17,20 +17,25 @@
 
 package org.apache.spark.streaming.kafka09
 
-import java.{ util => ju }
+import java.{util => ju}
+import java.io.OutputStream
+import java.lang.{Integer => JInt, Long => JLong}
 
+import com.google.common.base.Charsets.UTF_8
+import net.razorvine.pickle.{IObjectPickler, Opcodes, Pickler}
 import org.apache.kafka.clients.consumer._
 import org.apache.kafka.common.TopicPartition
 
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.api.java.{ JavaRDD, JavaSparkContext }
-import org.apache.spark.api.java.function.{ Function0 => JFunction0 }
+import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
+import org.apache.spark.api.java.function.{Function0 => JFunction0}
+import org.apache.spark.api.python.SerDeUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.streaming.StreamingContext
-import org.apache.spark.streaming.api.java.{ JavaInputDStream, JavaStreamingContext }
-import org.apache.spark.streaming.dstream._
+import org.apache.spark.streaming.api.java.{JavaDStream, JavaInputDStream, JavaStreamingContext}
+import org.apache.spark.streaming.dstream.InputDStream
 
 /**
  * :: Experimental ::
@@ -76,22 +81,21 @@ object KafkaUtils extends Logging {
   }
 
   /**
-   * :: Experimental ::
-   * Java constructor for a batch-oriented interface for consuming from Kafka.
-   * Starting and ending offsets are specified in advance,
-   * so that you can control exactly-once semantics.
-   * @param keyClass Class of the keys in the Kafka records
-   * @param valueClass Class of the values in the Kafka records
-   * @param kafkaParams Kafka
-   * <a href="http://kafka.apache.org/documentation.html#newconsumerconfigs">
-   * configuration parameters</a>. Requires "bootstrap.servers" to be set
-   * with Kafka broker(s) specified in host1:port1,host2:port2 form.
-   * @param offsetRanges offset ranges that define the Kafka data belonging to this RDD
-   * @param locationStrategy In most cases, pass in LocationStrategies.preferConsistent,
-   *   see [[LocationStrategies]] for more details.
-   * @tparam K type of Kafka message key
-   * @tparam V type of Kafka message value
-   */
+  * :: Experimental ::
+  * Java constructor for a batch-oriented interface for consuming from Kafka.
+  * Starting and ending offsets are specified in advance,
+  * so that you can control exactly-once semantics.
+  *
+  * @param kafkaParams      Kafka
+  *                     <a href="http://kafka.apache.org/documentation.html#newconsumerconfigs">
+  *                         configuration parameters</a>. Requires "bootstrap.servers" to be set
+  *                         with Kafka broker(s) specified in host1:port1,host2:port2 form.
+  * @param offsetRanges     offset ranges that define the Kafka data belonging to this RDD
+  * @param locationStrategy In most cases, pass in LocationStrategies.PreferConsistent,
+  *                         see [[LocationStrategies]] for more details.
+  * @tparam K type of Kafka message key
+  * @tparam V type of Kafka message value
+  */
   @Experimental
   def createRDD[K, V](
       jsc: JavaSparkContext,
@@ -104,19 +108,20 @@ object KafkaUtils extends Logging {
   }
 
   /**
-   * :: Experimental ::
-   * Scala constructor for a DStream where
-   * each given Kafka topic/partition corresponds to an RDD partition.
-   * The spark configuration spark.streaming.kafka.maxRatePerPartition gives the maximum number
-   *  of messages
-   * per second that each '''partition''' will accept.
-   * @param locationStrategy In most cases, pass in LocationStrategies.preferConsistent,
-   *   see [[LocationStrategies]] for more details.
-   * @param consumerStrategy In most cases, pass in ConsumerStrategies.subscribe,
-   *   see [[ConsumerStrategies]] for more details
-   * @tparam K type of Kafka message key
-   * @tparam V type of Kafka message value
-   */
+  * :: Experimental ::
+  * Scala constructor for a DStream where
+  * each given Kafka topic/partition corresponds to an RDD partition.
+  * The spark configuration spark.streaming.kafka.maxRatePerPartition gives the maximum number
+  * of messages
+  * per second that each '''partition''' will accept.
+  *
+  * @param locationStrategy In most cases, pass in LocationStrategies.PreferConsistent,
+  *                         see [[LocationStrategies]] for more details.
+  * @param consumerStrategy In most cases, pass in ConsumerStrategies.Subscribe,
+  *                         see [[ConsumerStrategies]] for more details.
+  * @tparam K type of Kafka message key
+  * @tparam V type of Kafka message value
+  */
   @Experimental
   def createDirectStream[K, V](
       ssc: StreamingContext,
@@ -127,18 +132,17 @@ object KafkaUtils extends Logging {
   }
 
   /**
-   * :: Experimental ::
-   * Java constructor for a DStream where
-   * each given Kafka topic/partition corresponds to an RDD partition.
-   * @param keyClass Class of the keys in the Kafka records
-   * @param valueClass Class of the values in the Kafka records
-   * @param locationStrategy In most cases, pass in LocationStrategies.preferConsistent,
-   *   see [[LocationStrategies]] for more details.
-   * @param consumerStrategy In most cases, pass in ConsumerStrategies.subscribe,
-   *   see [[ConsumerStrategies]] for more details
-   * @tparam K type of Kafka message key
-   * @tparam V type of Kafka message value
-   */
+  * :: Experimental ::
+  * Java constructor for a DStream where
+  * each given Kafka topic/partition corresponds to an RDD partition.
+  *
+  * @param locationStrategy In most cases, pass in LocationStrategies.PreferConsistent,
+  *                         see [[LocationStrategies]] for more details.
+  * @param consumerStrategy In most cases, pass in ConsumerStrategies.Subscribe,
+  *                         see [[ConsumerStrategies]] for more details.
+  * @tparam K type of Kafka message key
+  * @tparam V type of Kafka message value
+  */
   @Experimental
   def createDirectStream[K, V](
       jssc: JavaStreamingContext,
@@ -151,8 +155,8 @@ object KafkaUtils extends Logging {
   }
 
   /**
-   * Tweak kafka params to prevent issues on executors
-   */
+  * Tweak kafka params to prevent issues on executors
+  */
   private[kafka09] def fixKafkaParams(kafkaParams: ju.HashMap[String, Object]): Unit = {
     logWarning(s"overriding ${ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG} to false for executor")
     kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false: java.lang.Boolean)
@@ -177,3 +181,75 @@ object KafkaUtils extends Logging {
     }
   }
 }
+
+object KafkaUtilsPythonHelper {
+  private var initialized = false  // only touched inside initialize()'s synchronized block
+  // Runs SerDeUtil.initialize(), then registers the ConsumerRecord pickler at most once.
+  def initialize(): Unit = {
+    SerDeUtil.initialize()
+    synchronized {
+      if (!initialized) {
+        new PythonMessageAndMetadataPickler().register()
+        initialized = true
+      }
+    }
+  }
+  // Invoked from the object body, so registration happens when this object is initialized.
+  initialize()
+  // Wraps an iterator of byte-array ConsumerRecords in a SerDeUtil.AutoBatchedPickler.
+  def picklerIterator(iter: Iterator[ConsumerRecord[Array[Byte], Array[Byte]]]
+    ): Iterator[Array[Byte]] = {
+    new SerDeUtil.AutoBatchedPickler(iter)
+  }
+  // Custom pickler registered for ConsumerRecord and for itself (see register()).
+  class PythonMessageAndMetadataPickler extends IObjectPickler {
+    private val module = "pyspark.streaming.kafka"
+    // NOTE(review): kafka.py is renamed to kafka08.py in this change; confirm this module name.
+    def register(): Unit = {
+      Pickler.registerCustomPickler(classOf[ConsumerRecord[Any, Any]], this)
+      Pickler.registerCustomPickler(this.getClass, this)
+    }
+    // Writes only the GLOBAL when obj == this; otherwise GLOBAL, a 5-field tuple, then REDUCE.
+    def pickle(obj: Object, out: OutputStream, pickler: Pickler) {
+      if (obj == this) {
+        out.write(Opcodes.GLOBAL)
+        out.write(s"$module\nKafkaMessageAndMetadata\n".getBytes(UTF_8))
+      } else {
+        pickler.save(this)  // presumably re-enters pickle() via the this.getClass registration
+        val msgAndMetaData = obj.asInstanceOf[ConsumerRecord[Array[Byte], Array[Byte]]]
+        out.write(Opcodes.MARK)
+        pickler.save(msgAndMetaData.topic)
+        pickler.save(msgAndMetaData.partition)
+        pickler.save(msgAndMetaData.offset)
+        pickler.save(msgAndMetaData.key)
+        pickler.save(msgAndMetaData.value)
+        out.write(Opcodes.TUPLE)  // (topic, partition, offset, key, value)
+        out.write(Opcodes.REDUCE)  // apply the saved global to that tuple
+      }
+    }
+  }
+
+  /**
+   * :: Experimental ::
+   * Creates a direct Kafka stream of raw (key, value) byte-array pairs as a JavaDStream.
+   * Delegates to KafkaUtils.createDirectStream and keeps only each record's key and value.
+   *
+   * @param jssc             Java streaming context; its underlying ssc is used for the stream
+   * @param locationStrategy passed through unchanged to KafkaUtils.createDirectStream
+   * @param consumerStrategy passed through unchanged to KafkaUtils.createDirectStream
+   * @return JavaDStream of (key, value) tuples, both as Array[Byte]
+   */
+  @Experimental
+  def createDirectStream(
+      jssc: JavaStreamingContext,
+      locationStrategy: LocationStrategy,
+      consumerStrategy: ConsumerStrategy[Array[Byte], Array[Byte]]
+    ): JavaDStream[(Array[Byte], Array[Byte])] = {
+
+    val dStream = KafkaUtils.createDirectStream[Array[Byte], Array[Byte]](
+      jssc.ssc, locationStrategy, consumerStrategy)
+      .map(cm => (cm.key, cm.value))
+
+    new JavaDStream[(Array[Byte], Array[Byte])](dStream)
+  }
+}
@@ -30,7 +30,7 @@ import org.apache.spark.annotation.Experimental
  *  :: Experimental ::
  * Choice of how to schedule consumers for a given TopicPartition on an executor.
  * See [[LocationStrategies]] to obtain instances.
- * Kafka 0.10 consumers prefetch messages, so it's important for performance
+ * Kafka 0.9 consumers prefetch messages, so it's important for performance
  * to keep cached consumers on appropriate executors, not recreate them for every partition.
  * Choice of location is only a preference, not an absolute; partitions may be scheduled elsewhere.
  */