750 commits
58210f2
[SQL] Minor: Introduce SchemaRDD#aggregate() for simple aggregations
aarondav May 26, 2014
035b976
HOTFIX: Add no-arg SparkContext constructor in Java
pwendell May 26, 2014
88c9844
[SPARK-1914] [SQL] Simplify CountFunction not to traverse to evaluate…
ueshin May 26, 2014
e6c0550
Fix scalastyle warnings in yarn alpha
witgo May 26, 2014
c282a31
SPARK-1925: Replace '&' with '&&'
zsxwing May 26, 2014
294e5c2
[SPARK-1931] Reconstruct routing tables in Graph.partitionBy
ankurdave May 26, 2014
d56d894
SPARK-1929 DAGScheduler suspended by local task OOM
zhpengg May 27, 2014
66a244d
Fixed the error message for OutOfMemoryError in DAGScheduler.
rxin May 27, 2014
e3ca337
Updated dev Python scripts to make them PEP8 compliant.
rxin May 27, 2014
b3aa4be
SPARK-1933: Throw a more meaningful exception when a directory is pas…
rxin May 27, 2014
b2158a3
SPARK-1932: Fix race conditions in onReceiveCallback and cachedPeers
zsxwing May 27, 2014
a0d07eb
bugfix worker DriverStateChanged state should match DriverState.FAILED
lianhuiwang May 27, 2014
f1c7811
[SPARK-1926] [SQL] Nullability of Max/Min/First should be true.
ueshin May 27, 2014
8c631a0
[SPARK-1915] [SQL] AverageFunction should not count if the evaluated …
ueshin May 27, 2014
0c62787
[SQL] SPARK-1922
May 27, 2014
cf00662
[SPARK-1938] [SQL] ApproxCountDistinctMergeFunction should return Int…
ueshin May 28, 2014
c01fd48
Fix doc about NetworkWordCount/JavaNetworkWordCount usage of spark st…
jmu May 28, 2014
bde5ff5
Organize configuration docs
pwendell May 28, 2014
369b6dc
Spark 1916
May 28, 2014
be53829
[SPARK-1712]: TaskDescription instance is too big causes Spark to hang
witgo May 28, 2014
8b15ccd
Added doctest and method description in context.py
jyotiska May 29, 2014
5f8c829
SPARK-1935: Explicitly add commons-codec 1.5 as a dependency.
yhuai May 29, 2014
e26687c
[SPARK-1368][SQL] Optimized HiveTableScan
liancheng May 29, 2014
ebcfe8e
initial version of LPA
ankurdave May 29, 2014
3fe7d72
[SPARK-1820] Make GenerateMimaIgnore @DeveloperApi annotation aware.
ScrapCodes May 30, 2014
39fd10a
[SPARK-1566] consolidate programming guide, and general doc updates
mateiz May 30, 2014
d770a90
[SPARK-1971] Update MIMA to compare against Spark 1.0.0
ScrapCodes May 30, 2014
fb94cfb
[SPARK-1901] worker should make sure executor has exited before updat…
zhpengg May 30, 2014
0d21059
Typo: and -> an
ash211 May 31, 2014
342cce6
updated link to mailing list
nchammas May 31, 2014
1b04476
SPARK-1976: fix the misleading part in streaming docs
CodingCat May 31, 2014
96fdfbb
[SPARK-1959] String "NULL" shouldn't be interpreted as null value
liancheng May 31, 2014
0325c8d
correct tiny comment error
CrazyJvm May 31, 2014
3ab4055
[SPARK-1947] [SQL] Child of SumDistinct or Average should be widened …
ueshin May 31, 2014
d4a04a5
Optionally include Hive as a dependency of the REPL.
marmbrus May 31, 2014
8696d9f
[SQL] SPARK-1964 Add timestamp to hive metastore type parser.
marmbrus May 31, 2014
08a7929
Super minor: Close inputStream in SparkSubmitArguments
aarondav May 31, 2014
ee975a8
SPARK-1839: PySpark RDD#take() shouldn't always read from driver
aarondav May 31, 2014
8b37805
Improve maven plugin configuration
witgo May 31, 2014
a36d925
SPARK-1917: fix PySpark import of scipy.special functions
laserson May 31, 2014
7b8d2c2
updated java code blocks in spark SQL guide such that ctx will refer …
Jun 1, 2014
ffc4cf1
Made spark_ec2.py PEP8 compliant.
rxin Jun 1, 2014
e5cd817
Better explanation for how to use MIMA excludes.
pwendell Jun 2, 2014
e84aa22
Add landmark-based Shortest Path algorithm to graphx.lib
ankurdave Jun 2, 2014
a5602da
[SPARK-1553] Alternating nonnegative least-squares
tmyklebu Jun 2, 2014
ce3bf45
[SPARK-1958] Calling .collect() on a SchemaRDD should call executeCol…
liancheng Jun 2, 2014
e3b62b7
[SPARK-1995][SQL] system function upper and lower can be supported
egraldlo Jun 3, 2014
5219429
Avoid dynamic dispatching when unwrapping Hive data.
liancheng Jun 3, 2014
4a51dd8
[SPARK-1942] Stop clearing spark.driver.port in unit tests
Jun 3, 2014
fc8e738
SPARK-2001 : Remove docs/spark-debugger.md from master
hsaputra Jun 3, 2014
34af7ed
[SPARK-1912] fix compress memory issue during reduce
cloud-fan Jun 3, 2014
186879f
Add support for Pivotal HD in the Maven build: SPARK-1992
tzolov Jun 3, 2014
6544239
[SPARK-1468] Modify the partition function used by partitionBy.
Jun 3, 2014
4c42c45
fix java.lang.ClassCastException
baishuo Jun 3, 2014
b1ec902
Synthetic GraphX Benchmark
jegonzal Jun 3, 2014
c17bea0
[SPARK-1991] Support custom storage levels for vertices and edges
ankurdave Jun 3, 2014
05fa288
Fixed a typo
dbtsai Jun 4, 2014
d890b93
[SPARK-1161] Add saveAsPickleFile and SparkContext.pickleFile in Python
kanzhang Jun 4, 2014
146665f
SPARK-1941: Update streamlib to 2.7.0 and use HyperLogLogPlus instead…
rxin Jun 4, 2014
40ceb3d
use env default python in merge_spark_pr.py
mengxr Jun 4, 2014
e7d0f3c
Enable repartitioning of graph over different number of partitions
jegonzal Jun 4, 2014
19edca2
Update spark-ec2 scripts for 1.0.0 on master
aarondav Jun 4, 2014
177ef5d
SPARK-1806 (addendum) Use non-deprecated methods in Mesos 0.18
srowen Jun 4, 2014
ae55c2c
[SPARK-1817] RDD.zip() should verify partition sizes for each partition
kanzhang Jun 4, 2014
7c8a1d0
[MLLIB] set RDD names in ALS
nevillelyh Jun 4, 2014
5be3319
SPARK-1973. Add randomSplit to JavaRDD (with tests, and tidy Java tests)
srowen Jun 4, 2014
349560c
[SPARK-1752][MLLIB] Standardize text format for vectors and labeled p…
mengxr Jun 4, 2014
52cfc18
SPARK-1518: FileLogger: Fix compile against Hadoop trunk
Jun 4, 2014
2cc0bd3
SPARK-1790: Update EC2 scripts to support r3 instance types
sujeetv Jun 4, 2014
d8d2e93
Minor: Fix documentation error from apache/spark#946
ankurdave Jun 4, 2014
285cac4
Fix issue in ReplSuite with hadoop-provided profile.
Jun 5, 2014
4671126
[SPARK-2029] Bump pom.xml version number of master branch to 1.1.0-SN…
ueshin Jun 5, 2014
01f4c13
SPARK-1677: allow user to disable output dir existence checking
CodingCat Jun 5, 2014
aaee35f
[SPARK-2036] [SQL] CaseConversionExpression should check if the evalu…
ueshin Jun 5, 2014
654e63e
HOTFIX: Remove generated-mima-excludes file after runing MIMA.
pwendell Jun 5, 2014
15d8bf6
sbt 0.13.X should be using sbt-assembly 0.11.X
kalpit Jun 5, 2014
08e04ed
Remove compile-scoped junit dependency.
Jun 5, 2014
2ea7ce7
[SPARK-2041][SQL] Correctly analyze queries where columnName == table…
marmbrus Jun 6, 2014
f052dff
Use pluggable clock in DAGSheduler #SPARK-2031
CrazyJvm Jun 6, 2014
f9ab450
[SPARK-2025] Unpersist edges of previous graph in Pregel
ankurdave Jun 6, 2014
229c135
SPARK-2043: ExternalAppendOnlyMap doesn't always find matching keys
mateiz Jun 6, 2014
4b1266c
[SPARK-2050][SQL] LIKE, RLIKE and IN in HQL should not be case sensit…
marmbrus Jun 6, 2014
ab928c9
[SPARK-1552] Fix type comparison bug in {map,outerJoin}Vertices
ankurdave Jun 6, 2014
8f65c03
[SPARK-2050 - 2][SQL] DIV and BETWEEN should not be case sensitive.
marmbrus Jun 6, 2014
a761f16
[SPARK-1841]: update scalatest to version 2.1.5
witgo Jun 6, 2014
684a102
[SPARK-1994][SQL] Weird data corruption bug when running Spark SQL on…
marmbrus Jun 7, 2014
e6bc51f
HOTFIX: Support empty body in merge script
pwendell Jun 7, 2014
4100154
SPARK-2056 Set RDD name to input path
nevillelyh Jun 7, 2014
da48c74
SPARK-2026: Maven Hadoop Profiles Should Set The Hadoop Version
berngp Jun 8, 2014
6c505e6
SPARK-1898: In deploy.yarn.Client, use YarnClient not YarnClientImpl
Jun 8, 2014
4ea7110
SPARK-1628: Add missing hashCode methods in Partitioner subclasses
zsxwing Jun 8, 2014
e6e2963
Update run-example
maji2014 Jun 8, 2014
3a73aa7
SPARK-1628 follow up: Improve RangePartitioner's documentation.
rxin Jun 9, 2014
7cb4cc0
[SPARK-2067] use relative path for Spark logo in UI
nevillelyh Jun 9, 2014
20d5f4b
Grammar: read -> reads
ash211 Jun 9, 2014
c018887
[SPARK-1308] Add getNumPartitions to pyspark RDD
Jun 9, 2014
fb64844
SPARK-1944 Document --verbose in spark-shell -h
ash211 Jun 9, 2014
b111815
[SPARK-1495][SQL]add support for left semi join
adrian-wang Jun 9, 2014
12c198d
Added a TaskSetManager unit test.
kayousterhout Jun 9, 2014
93ddcbc
[SPARK-1522] : YARN ClientBase throws a NPE if there is no YARN Appli…
berngp Jun 9, 2014
8045f49
[SQL] Simple framework for debugging query execution
marmbrus Jun 9, 2014
ed5dd10
[SPARK-1704][SQL] Fully support EXPLAIN commands as SchemaRDD.
concretevitamin Jun 9, 2014
5f9b236
Make sure that empty string is filtered out when we get the secondary…
dbtsai Jun 10, 2014
a034761
SPARK-1416: PySpark support for SequenceFile and Hadoop InputFormats
MLnick Jun 10, 2014
cfb22da
[SPARK-1508][SQL] Add SQLConf to SQLContext.
concretevitamin Jun 10, 2014
d00cae8
Moved hiveOperators.scala to the right package folder
liancheng Jun 10, 2014
34a0b15
[SPARK-1978] In some cases, spark-yarn does not automatically restart…
witgo Jun 10, 2014
f59bead
[SPARK-2076][SQL] Pushdown the join filter & predication for outer join
chenghao-intel Jun 10, 2014
a37e7a4
HOTFIX: Fix Python tests on Jenkins.
pwendell Jun 10, 2014
cfa8c4e
HOTFIX: Increase time limit for Bagel test
ankurdave Jun 10, 2014
93c0e0e
[SQL] Add average overflow test case from #978
egraldlo Jun 10, 2014
a32746f
[SPARK-1998] SparkFlumeEvent with body bigger than 1020 bytes are not…
joyyoj Jun 11, 2014
576e2d0
[SPARK-1940] Enabling rolling of executor logs, and automatic cleanup…
tdas Jun 11, 2014
9645119
Resolve scalatest warnings during build
witgo Jun 11, 2014
eaccf3b
[SPARK-2065] give launched instances names
nchammas Jun 11, 2014
d78cc81
HOTFIX: clear() configs in SQLConf-related unit tests.
concretevitamin Jun 11, 2014
be2ec07
[SPARK-2093] [SQL] NullPropagation should use exact type value.
ueshin Jun 11, 2014
487153b
[SPARK-1968][SQL] SQL/HiveQL command for caching/uncaching tables
liancheng Jun 11, 2014
f1fe720
[SPARK-2091][MLLIB] use numpy.dot instead of ndarray.dot
mengxr Jun 11, 2014
764ffcf
SPARK-2107: FilterPushdownSuite doesn't need Junit jar.
Qiuzhuang Jun 11, 2014
974bf33
SPARK-1639. Tidy up some Spark on YARN code
sryza Jun 11, 2014
1acf918
[SPARK-2069] MIMA false positives
ScrapCodes Jun 11, 2014
7e8809d
[SPARK-2108] Mark SparkContext methods that return block information …
ScrapCodes Jun 11, 2014
2dc0b65
SPARK-2113: awaitTermination() after stop() will hang in Spark Stremaing
Jun 11, 2014
ce0886f
[SPARK-2042] Prevent unnecessary shuffle triggered by take()
sameeragarwal Jun 11, 2014
e991148
[SQL] Code Cleanup: Left Semi Hash Join
adrian-wang Jun 11, 2014
5b4ac43
HOTFIX: A few PySpark tests were not actually run
andrewor14 Jun 11, 2014
6b02002
HOTFIX: PySpark tests should be order insensitive.
pwendell Jun 11, 2014
c90169a
HOTFIX: Forgot to remove false change in previous commit
pwendell Jun 11, 2014
375476c
[SPARK-2052] [SQL] Add optimization for CaseConversionExpression's.
ueshin Jun 12, 2014
9ec0b85
[SPARK-1672][MLLIB] Separate user and product partitioning in ALS
tmyklebu Jun 12, 2014
19dbc8c
[SPARK-2044] Pluggable interface for shuffles
mateiz Jun 12, 2014
ae3cf5c
'killFuture' is never used
watermen Jun 12, 2014
4f602e5
Cleanup on Connection and ConnectionManager
hsaputra Jun 12, 2014
aad4110
fixed typo in docstring for min()
jkthompson Jun 12, 2014
ca8a046
SPARK-554. Add aggregateByKey.
sryza Jun 12, 2014
fa21377
[SPARK-2088] fix NPE in toString
dorx Jun 12, 2014
35f81c1
[SPARK-2080] Yarn: report HS URL in client mode, correct user in clus…
Jun 12, 2014
cb96cd2
SPARK-1843: Replace assemble-deps with env variable.
pwendell Jun 12, 2014
b4dc414
SPARK-2085: [MLlib] Apply user-specific regularization instead of uni…
Jun 13, 2014
166ec29
document laziness of parallelize
Jun 13, 2014
319f578
SPARK-1939 Refactor takeSample method in RDD to use ScaSRS
dorx Jun 13, 2014
e031c2f
[Minor] Fix style, formatting and naming in BlockManager etc.
andrewor14 Jun 13, 2014
c875a03
[SPARK-1516]Throw exception in yarn client instead of run system.exit…
codeboyyong Jun 13, 2014
425f85c
[SPARK-2135][SQL] Use planner for in-memory scans
marmbrus Jun 13, 2014
912e9f6
[HOTFIX] add math3 version to pom
mengxr Jun 13, 2014
697b7ad
Workaround in Spark for ConcurrentModification issue (JIRA Hadoop-104…
nishkamravi2 Jun 13, 2014
fa9a017
[SPARK-1964][SQL] Add timestamp to HiveMetastoreTypes.toMetastoreType
marmbrus Jun 13, 2014
bb0ddc2
[SPARK-2094][SQL] "Exactly once" semantics for DDL and command statem…
liancheng Jun 13, 2014
2b9b907
Small correction in Streaming Programming Guide doc
akkomar Jun 13, 2014
51577ba
[Spark-2137][SQL] Timestamp UDFs broken
yhuai Jun 14, 2014
6e69569
[SPARK-2079] Support batching when serializing SchemaRDD to Python
kanzhang Jun 14, 2014
bd56556
[SPARK-2013] Documentation for saveAsPickleFile and pickleFile in Python
kanzhang Jun 14, 2014
79ef2d8
[SPARK-1837] NumericRange should be partitioned in the same way as ot…
kanzhang Jun 14, 2014
d1de0aa
[SQL] Support transforming TreeNodes with Option children.
marmbrus Jun 15, 2014
b19966d
[SPARK-937] adding EXITED executor state and not relaunching cleanly …
kanzhang Jun 15, 2014
7658285
SPARK-1999: StorageLevel in storage tab and RDD Storage Info never ch…
CrazyJvm Jun 16, 2014
9176e53
SPARK-2148 Add link to requirements for custom equals() and hashcode(…
ash211 Jun 16, 2014
5ce095b
Updating docs to include missing information about reducers and clari…
alig Jun 16, 2014
52b2ed5
SPARK-2039: apply output dir existence checking for all output formats
CodingCat Jun 16, 2014
7133ad9
[SPARK-2010] Support for nested data in PySpark SQL
kanzhang Jun 16, 2014
692a679
[SPARK-1930] The Container is running beyond physical memory limits, …
witgo Jun 16, 2014
4877e05
[SQL][SPARK-2094] Follow up of PR #1071 for Java API
liancheng Jun 16, 2014
3d795ee
Minor fix: made "EXPLAIN" output to play well with JDBC output format
liancheng Jun 16, 2014
1b2dc5a
MLlib documentation fix
afomenko Jun 17, 2014
8df44f1
[SPARK-2130] End-user friendly String repr for StorageLevel in Python
kanzhang Jun 17, 2014
8d65d44
SPARK-1990: added compatibility for python 2.6 for ssh_read command
AtlasPilotPuppy Jun 17, 2014
a2ef7f2
SPARK-2035: Store call stack for stages, display it on the UI.
darabos Jun 17, 2014
723bdf7
[SPARK-2144] ExecutorsPage reports incorrect # of RDD blocks
andrewor14 Jun 17, 2014
f396726
[SPARK-2164][SQL] Allow Hive UDF on columns of type struct
conviva-zz Jun 17, 2014
8b77c36
[SPARK-2053][SQL] Add Catalyst expressions for CASE WHEN.
concretevitamin Jun 17, 2014
b847fe5
SPARK-1063 Add .sortBy(f) method on RDD
ash211 Jun 17, 2014
5b7ab96
SPARK-2146. Fix takeOrdered doc
sryza Jun 17, 2014
c6ebbda
SPARK-2038: rename "conf" parameters in the saveAsHadoop functions
CodingCat Jun 17, 2014
f81fc0f
[SPARK-2147 / 2161] Show removed executors on the UI
andrewor14 Jun 17, 2014
1c3a0ab
HOTFIX: bug caused by #941
pwendell Jun 17, 2014
a8694be
[SPARK-2060][SQL] Querying JSON Datasets with SQL and DSL in Spark SQL
yhuai Jun 18, 2014
d5e008f
Revert "SPARK-2038: rename "conf" parameters in the saveAsHadoop func…
pwendell Jun 18, 2014
0126cea
[STREAMING] SPARK-2009 Key not found exception when slow receiver starts
vchekan Jun 18, 2014
d562dec
[SPARK-2176][SQL] Extra unnecessary exchange operator in the result o…
yhuai Jun 18, 2014
97f9315
[SPARK-2162] Double check in doGetLocal to avoid read on removed block.
colorant Jun 18, 2014
422e642
Updated the comment for SPARK-2162.
rxin Jun 18, 2014
9869bea
[SPARK-1466] Raise exception if pyspark Gateway process doesn't start.
kayousterhout Jun 18, 2014
b11c599
SPARK-2158 Clean up core/stdout file from FileAppenderSuite
markhamstra Jun 18, 2014
21522a3
Remove unicode operator from RDD.scala
dorx Jun 18, 2014
4a0c6b5
[SPARK-2184][SQL] AddExchange isn't idempotent
marmbrus Jun 19, 2014
85babcb
Squishing a typo bug before it causes real harm
dorx Jun 19, 2014
814827b
[SPARK-2187] Explain should not run the optimizer twice.
rxin Jun 19, 2014
b30b28c
Minor fix
WangTaoTheTonic Jun 19, 2014
60c613e
[SPARK-2051]In yarn.ClientBase spark.yarn.dist.* do not work
witgo Jun 19, 2014
cc40c06
[SPARK-2191][SQL] Make sure InsertIntoHiveTable doesn't execute more …
marmbrus Jun 19, 2014
38df2e3
[SPARK-2151] Recognize memory format for spark-submit
nishkamravi2 Jun 20, 2014
10d23c4
A few minor Spark SQL Scaladoc fixes.
rxin Jun 20, 2014
64db752
HOTFIX: SPARK-2208 local metrics tests can fail on fast machines
pwendell Jun 20, 2014
adeca4f
More minor scaladoc cleanup for Spark SQL.
rxin Jun 20, 2014
af22207
[SQL] Improve Speed of InsertIntoHiveTable
marmbrus Jun 20, 2014
05255de
[SPARK-2177][SQL] describe table result contains only one column
yhuai Jun 20, 2014
aac5872
SPARK-1293 [SQL] Parquet support for nested types
AndreSchumacher Jun 20, 2014
32c30ca
[SPARK-2210] cast to boolean on boolean value gets turned into NOT((b…
rxin Jun 20, 2014
39693e1
[SPARK-2209][SQL] Cast shouldn't do null check twice.
rxin Jun 20, 2014
a62c682
SPARK-2203: PySpark defaults to use same num reduce partitions as map…
aarondav Jun 20, 2014
47a6428
[SPARK-2196] [SQL] Fix nullability of CaseWhen.
ueshin Jun 20, 2014
8b9b33d
[SPARK-2218] rename Equals to EqualTo in Spark SQL expressions.
rxin Jun 20, 2014
38008be
[SPARK-2163] class LBFGS optimize with Double tolerance instead of Int
BaiGang Jun 20, 2014
3060c81
SPARK-1868: Users should be allowed to cogroup at least 4 RDDs
douglaz Jun 20, 2014
bb2a4e2
SPARK-2180: support HAVING clauses in Hive queries
willb Jun 20, 2014
9a56b58
[SPARK-2225] Turn HAVING without GROUP BY into WHERE.
rxin Jun 20, 2014
713c103
Clean up CacheManager et al.
andrewor14 Jun 21, 2014
8505024
Move ScriptTransformation into the appropriate place.
rxin Jun 21, 2014
50b9f6a
[SQL] Use hive.SessionState, not the thread local SessionState
aarondav Jun 21, 2014
1e6c770
SPARK-1902 Silence stacktrace from logs when doing port failover to p…
ash211 Jun 21, 2014
602d4f8
[SPARK-1970] Update unit test in XORShiftRandomSuite to use ChiSquare…
dorx Jun 21, 2014
1460b75
HOTFIX: Fixing style error introduced by 08d0ac
pwendell Jun 21, 2014
c829cab
[SPARK-2061] Made splits deprecated in JavaRDDLike
AtlasPilotPuppy Jun 21, 2014
11d8332
Fix some tests.
Jun 21, 2014
1d5cbf3
[SQL] Pass SQLContext instead of SparkContext into physical operators.
rxin Jun 21, 2014
bb36494
[SQL] Break hiveOperators.scala into multiple files.
rxin Jun 21, 2014
db255ae
HOTFIX: Fix missing MIMA ignore
pwendell Jun 21, 2014
f6defcd
HOTFIX: Add excludes for new MIMA files
pwendell Jun 21, 2014
8ffa264
SPARK-1996. Remove use of special Maven repo for Akka
srowen Jun 22, 2014
8c7c21b
SPARK-2231: dev/run-tests should include YARN and use a recent Hadoop…
pwendell Jun 22, 2014
50016ca
SPARK-2034. KafkaInputDStream doesn't close resources and may prevent…
srowen Jun 22, 2014
a50e69d
SPARK-1316. Remove use of Commons IO
srowen Jun 22, 2014
b622e01
SPARK-2229: FileAppender throw an llegalArgumentException in jdk6
witgo Jun 23, 2014
c433491
SPARK-2241: quote command line args in ec2 script
orikremer Jun 23, 2014
fb18938
SPARK-2166 - Listing of instances to be terminated before the prompt
Jun 23, 2014
8fc1b52
[SPARK-1395] Fix "local:" URI support in Yarn mode (again).
Jun 23, 2014
7f84c3c
Fixed small running on YARN docs typo
frol Jun 23, 2014
74c2a35
Fix mvn detection
Jun 23, 2014
153cf30
[SPARK-1669][SQL] Made cacheTable idempotent
liancheng Jun 23, 2014
27bec31
[SPARK-2118] spark class should complain if tools jar is missing.
ScrapCodes Jun 23, 2014
4cb80a0
[SPARK-1768] History server enhancements.
Jun 23, 2014
f41dd6a
Cleanup on Connection, ConnectionManagerId, ConnectionManager classes…
hsaputra Jun 24, 2014
bd2fa02
[SPARK-2227] Support dfs command in SQL.
rxin Jun 24, 2014
52ac8a4
[SPARK-2124] Move aggregation into shuffle implementations
jerryshao Jun 24, 2014
e8530db
[SPARK-2252] Fix MathJax for HTTPs.
rxin Jun 24, 2014
7385516
SPARK-1937: fix issue with task locality
Jun 24, 2014
a18f843
HOTFIX: Disabling tests per SPARK-2264
pwendell Jun 24, 2014
b36f603
Fix broken Json tests.
kayousterhout Jun 24, 2014
dfd4a7f
[SPARK-2264][SQL] Fix failing CachedTableSuite
marmbrus Jun 25, 2014
0b46168
[SPARK-1112, 2156] Bootstrap to fetch the driver's Spark properties.
mengxr Jun 25, 2014
386a0a2
[SQL]Add base row updating methods for JoinedRow
chenghao-intel Jun 25, 2014
e7de368
Autodetect JAVA_HOME on RPM-based systems
Jun 25, 2014
b6ac76a
Fix possible null pointer in acumulator toString
marmbrus Jun 25, 2014
fc9c55b
SPARK-2248: spark.default.parallelism does not apply in local mode
witgo Jun 25, 2014
cdff363
[SPARK-2263][SQL] Support inserting MAP<K, V> to Hive tables
liancheng Jun 25, 2014
84b641b
[BUGFIX][SQL] Should match java.math.BigDecimal when wnrapping Hive o…
liancheng Jun 25, 2014
02ab309
SPARK-2038: rename "conf" parameters in the saveAsHadoop functions wi…
CodingCat Jun 25, 2014
07ea156
Replace doc reference to Shark with Spark SQL.
rxin Jun 25, 2014
faa1743
added support for VPC and placement group to spar_ec2.py
pdeyhim Jun 25, 2014
@@ -0,0 +1,35 @@
package org.apache.spark.streaming.examples

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kinesis.KinesisUtils
import org.apache.spark.streaming.StreamingContext._


object KinesisWordCount {
Contributor: Please provide full instructions on how to run this example. See other examples for more details.
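A sketch of the kind of usage header the other streaming examples carry; the run-example invocation and stream name below are illustrative, not part of this PR:

/**
 * Consumes records from an Amazon Kinesis stream and counts words.
 *
 * Usage: KinesisWordCount <master> <streamname> [accesskey] [accessSecretKey]
 *   <master> is the Spark master URL, <streamname> is the Kinesis stream name,
 *   and the access keys are optional (EC2 instance-profile credentials are
 *   used when they are omitted).
 *
 * Example:
 *   ./bin/run-example org.apache.spark.streaming.examples.KinesisWordCount \
 *     local[2] myKinesisStream
 */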

Contributor: Also, please add a Java Kinesis example. Many users ask for Java examples, and not having one can be a significant barrier to trying out the Kinesis support.

Contributor: @pdeyhim - per our offline conversation this weekend, please add a note about running this demo with a master that provides at least two threads (e.g. local[2]); otherwise the KinesisNetworkReceiver thread does not start up, breaking the demo.
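For instance, a minimal sketch (the app name and batch interval mirror the example below):

import org.apache.spark.streaming.{Seconds, StreamingContext}

// "local" would give the receiver the only available thread and no batches
// would ever be processed; "local[2]" leaves one thread for processing.
val ssc = new StreamingContext("local[2]", "KinesisWordCount", Seconds(2))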


  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println("Usage: KinesisWordCount <master> <streamname>" +
        " [accesskey] [accessSecretKey]")
      System.exit(1)
    }

    val master = args(0)
    val kinesisStream = args(1)
    // The access keys are optional; when omitted, EC2 instance-profile
    // credentials are used instead (see KinesisReceiver.credentialsProvider).
    val accesskey = if (args.length > 2) args(2) else ""
    val accessSecretKey = if (args.length > 3) args(3) else ""

    val ssc = new StreamingContext(master, "KinesisWordCount", Seconds(2),
      System.getenv("SPARK_HOME"), StreamingContext.jarOfClass(this.getClass).toSeq)

    val lines = KinesisUtils.createStream(ssc, accesskey, accessSecretKey, kinesisStream)

    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
73 changes: 73 additions & 0 deletions external/AmazonKinesis/pom.xml
@@ -0,0 +1,73 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one or more
  ~ contributor license agreements. See the NOTICE file distributed with
  ~ this work for additional information regarding copyright ownership.
  ~ The ASF licenses this file to You under the Apache License, Version 2.0
  ~ (the "License"); you may not use this file except in compliance with
  ~ the License. You may obtain a copy of the License at
  ~
  ~    http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
    <version>1.0.0-incubating-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>

  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming-amazonkinesis</artifactId>
  <packaging>jar</packaging>
  <name>Spark Project External Amazon Kinesis</name>
  <url>http://spark.incubator.apache.org/</url>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.binary.version}</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_${scala.binary.version}</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.novocode</groupId>
      <artifactId>junit-interface</artifactId>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <build>
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
    <plugins>
      <plugin>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest-maven-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>
@@ -0,0 +1,164 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming.kinesis

import java.net.{InetAddress, UnknownHostException}
import java.nio.charset.Charset
import java.util.{List, UUID}

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import com.amazonaws.auth.{AWSCredentials, AWSCredentialsProvider, BasicAWSCredentials,
  InstanceProfileCredentialsProvider}
import com.amazonaws.services.kinesis.clientlibrary.exceptions.{InvalidStateException,
  ShutdownException, ThrottlingException}
import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor,
  IRecordProcessorCheckpointer, IRecordProcessorFactory}
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{KinesisClientLibConfiguration,
  Worker}
import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.{NetworkInputDStream, NetworkReceiver}


private[streaming]
class KinesisInputDStream[T: ClassTag](
Contributor: Please provide docs on these classes, especially on the KinesisReceiver. The documentation must be sufficient for any other developer to look at the code, understand the control/data flow, and debug when required.

    @transient ssc_ : StreamingContext,
    accesskey: String,
    accessSecretKey: String,
    kinesisStream: String,
    kinesisEndpoint: String,
    storageLevel: StorageLevel
  ) extends NetworkInputDStream[String](ssc_) {

  override def getReceiver(): NetworkReceiver[String] = {
    new KinesisReceiver(accesskey, accessSecretKey, kinesisStream, kinesisEndpoint, storageLevel)
  }
}
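A sketch of the class-level doc the reviewer is requesting; the control/data-flow summary is inferred from this diff and is not authoritative:

/**
 * Input stream that pulls records from an Amazon Kinesis stream.
 *
 * getReceiver() returns a KinesisReceiver, which Spark runs on a worker node.
 * In onStart(), the receiver builds a Kinesis Client Library (KCL) Worker
 * wired to an IRecordProcessorFactory: each shard gets an IRecordProcessor
 * whose processRecords() decodes records as UTF-8 strings, appends them to
 * the BlockGenerator for storage at the configured StorageLevel, and
 * checkpoints progress to DynamoDB with bounded retries on throttling.
 */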


private[streaming]
class KinesisReceiver[T: ClassTag](
    accesskey: String,
    accessSecretKey: String,
    kinesisStream: String,
    kinesisEndpoint: String,
    storageLevel: StorageLevel
  ) extends NetworkReceiver[String] {

  val NUM_RETRIES = 5
  val BACKOFF_TIME_IN_MILLIS = 2000
  var workerId = UUID.randomUUID().toString()

  // Prefer the explicitly supplied access keys; fall back to EC2
  // instance-profile credentials when either key is empty.
  lazy val credentialsProvider = new AWSCredentialsProvider {

    def getCredentials(): AWSCredentials = {
      if (accesskey.isEmpty() || accessSecretKey.isEmpty()) {
        new InstanceProfileCredentialsProvider().getCredentials()
      } else {
        new BasicAWSCredentials(accesskey, accessSecretKey)
      }
    }

    def refresh() {}
  }

  try {
    workerId = InetAddress.getLocalHost().getCanonicalHostName() + ":" + UUID.randomUUID()
  } catch {
    case e: UnknownHostException => e.printStackTrace()
  }

  private lazy val decoder = Charset.forName("UTF-8").newDecoder()
  // The Kinesis stream name doubles as the KCL application name, which names
  // the DynamoDB table used for checkpointing.
  private lazy val kinesisClientLibConfiguration =
    new KinesisClientLibConfiguration(kinesisStream, kinesisStream, credentialsProvider, workerId)
      .withKinesisEndpoint(kinesisEndpoint)
  private lazy val blockGenerator = new BlockGenerator(storageLevel)

  protected override def onStart() {
    blockGenerator.start()

    // Each Kinesis shard is handled by an IRecordProcessor built by this factory.
    lazy val recordProcessorFactory: IRecordProcessorFactory = new IRecordProcessorFactory {
      def createProcessor(): IRecordProcessor = new IRecordProcessor {
Contributor: Please fix the indentation. Refer to the Spark Code Style Guide: https://cwiki.apache.org/confluence/display/SPARK/Spark+Code+Style+Guide


        def initialize(shardId: String) {
          logInfo("Starting with shardId: " + shardId)
        }

        def processRecords(records: List[Record], checkpointer: IRecordProcessorCheckpointer) {
          // Decode each record as UTF-8, push it to Spark's block generator
          // for storage, then checkpoint our position in the shard.
          records.toList.foreach(record => blockGenerator += decoder.decode(record.getData()).toString())
          checkpoint(checkpointer)
        }

        def shutdown(checkpointer: IRecordProcessorCheckpointer, reason: ShutdownReason) {
          logInfo("Shutting down Kinesis receiver: " + reason)
        }
      }
    }

    // Worker.run() blocks, pulling records from the stream's shards until stopped.
    val worker = new Worker(recordProcessorFactory, kinesisClientLibConfiguration)
    worker.run()
  }

  /**
   * Checkpoint this worker's progress to the KCL's DynamoDB table, backing
   * off and retrying up to NUM_RETRIES times on transient failures.
   */
  private def checkpoint(checkpointer: IRecordProcessorCheckpointer) {
Contributor: Please document what these functions are doing.


    for (i <- 1 to NUM_RETRIES) {
      try {
        checkpointer.checkpoint()
        return  // checkpoint succeeded; do not sleep or retry
      } catch {
        case se: ShutdownException =>
          // This processor has lost its shard lease; the new lease holder
          // will checkpoint instead.
          logInfo("Caught shutdown exception, skipping checkpoint.", se)
          return
        case e: ThrottlingException =>
          // Back off and re-attempt the checkpoint upon transient failures.
          if (i >= NUM_RETRIES) {
            logInfo("Checkpoint failed after " + i + " attempts.", e)
            return
          } else {
            logInfo("Transient issue when checkpointing - attempt " + i + " of " + NUM_RETRIES, e)
          }
        case e: InvalidStateException =>
          logInfo("Cannot save checkpoint to the DynamoDB table used by " +
            "the Amazon Kinesis Client Library.", e)
          return
      }
      try {
        Thread.sleep(BACKOFF_TIME_IN_MILLIS)
      } catch {
        case e: InterruptedException => logInfo("Interrupted sleep", e)
      }
    }
  }

  protected override def onStop() {
    blockGenerator.stop()
    logInfo("Amazon Kinesis receiver stopped")
  }
}
@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming.kinesis

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.api.java.JavaDStream

object KinesisUtils {

  /**
   * Create an input stream that pulls records from an Amazon Kinesis stream.
   * When accesskey and accessSecretKey are left empty, EC2 instance-profile
   * credentials are used instead.
   */
  def createStream(
      ssc: StreamingContext,
      accesskey: String = "",
      accessSecretKey: String = "",
      kinesisStream: String,
      kinesisEndpoint: String = "https://kinesis.us-east-1.amazonaws.com",
      storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER_2
    ): DStream[String] = {
    new KinesisInputDStream(ssc, accesskey, accessSecretKey, kinesisStream,
      kinesisEndpoint, storageLevel)
  }

  def createStream(
Contributor: Please provide complete Java documentation on how these functions are used. Refer to other XYZUtils to get an idea.

      jssc: JavaStreamingContext,
      accesskey: String,
      accessSecretKey: String,
      kinesisStream: String,
      kinesisEndpoint: String,
      storageLevel: StorageLevel
    ): JavaDStream[String] = {
    new KinesisInputDStream(jssc.ssc, accesskey, accessSecretKey, kinesisStream,
      kinesisEndpoint, storageLevel)
Contributor: Scala style issue.

  }
}
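A sketch of the fuller documentation the reviewer is asking for on the Java-friendly overload; the parameter descriptions are inferred from this diff rather than from any official docs:

/**
 * Create an input stream that pulls records from an Amazon Kinesis stream
 * (Java-friendly variant).
 *
 * @param jssc            JavaStreamingContext to attach the stream to
 * @param accesskey       AWS access key ID; if empty, EC2 instance-profile
 *                        credentials are used instead
 * @param accessSecretKey AWS secret access key
 * @param kinesisStream   Kinesis stream name, also used as the KCL application
 *                        name (which names the DynamoDB checkpoint table)
 * @param kinesisEndpoint Kinesis endpoint URL, e.g.
 *                        https://kinesis.us-east-1.amazonaws.com
 * @param storageLevel    storage level for received blocks
 */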
@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming.kinesis;


import org.junit.Test;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.LocalJavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaDStream;

public class JavaKinesisStreamSuite extends LocalJavaStreamingContext {
  @Test
  public void testKinesisStream() {
    // Only verifies that the Java API is callable and a DStream is constructed;
    // no records are received (see the discussion on the Scala suite).
    JavaDStream<String> test1 = KinesisUtils.createStream(ssc,
      "x", "y", "z", "1", StorageLevel.MEMORY_AND_DISK_SER_2());
Contributor: Same comment as the Scala unit test.

  }
}

@@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming.kinesis

import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
import org.apache.spark.storage.StorageLevel

class KinesisStreamSuite extends TestSuiteBase {

  test("Kinesis input stream") {
    val ssc = new StreamingContext(master, framework, batchDuration)
    // Only verifies stream construction; receiving data would require AWS
    // credentials (see the discussion below).
    val test1 = KinesisUtils.createStream(ssc, accesskey = "x", accessSecretKey = "y",
      kinesisStream = "z")
Contributor: This unit test does not really test anything. Is it possible to add a unit test that actually tests receiving data? Without proper unit tests, we have a lot of trouble understanding and analyzing failures.

Author: Unless I include AWS credentials in this test, there's no way to receive data from Kinesis. I'll take another look and see if I can come up with something more comprehensive, but very little can be done without credentials.

    ssc.stop()
  }
}
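As a possible middle ground until credentials are available, a sketch of a check that exercises the wiring without contacting AWS (the test name and assertion are illustrative and would slot into KinesisStreamSuite):

test("Kinesis stream setup does not contact AWS") {
  val ssc = new StreamingContext(master, framework, batchDuration)
  // Building the DStream only constructs the graph; the receiver (and any
  // AWS call) would start only on ssc.start(), which is never invoked here.
  val stream = KinesisUtils.createStream(ssc, accesskey = "x",
    accessSecretKey = "y", kinesisStream = "z")
  assert(stream != null)
  ssc.stop()
}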