-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-12244][SPARK-12245][STREAMING] Rename trackStateByKey to mapWithState and change tracking function signature #10224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
68d1c64
7c37c2c
13b8cb4
4e9f778
495b982
6f5c694
6d29b7a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,7 @@ import org.apache.spark.{HashPartitioner, Partitioner} | |
| /** | ||
| * :: Experimental :: | ||
| * Abstract class representing all the specifications of the DStream transformation | ||
| * `trackStateByKey` operation of a | ||
| * `mapWithState` operation of a | ||
| * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala) or a | ||
| * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java). | ||
| * Use the [[org.apache.spark.streaming.StateSpec StateSpec.apply()]] or | ||
|
|
@@ -37,42 +37,47 @@ import org.apache.spark.{HashPartitioner, Partitioner} | |
| * | ||
| * Example in Scala: | ||
| * {{{ | ||
| * def trackingFunction(data: Option[ValueType], wrappedState: State[StateType]): EmittedType = { | ||
| * def mappingFunction(data: Option[ValueType], wrappedState: State[StateType]): MappedType = { | ||
| * ... | ||
| * } | ||
| * | ||
| * val spec = StateSpec.function(trackingFunction).numPartitions(10) | ||
| * val spec = StateSpec.function(mappingFunction).numPartitions(10) | ||
| * | ||
| * val emittedRecordDStream = keyValueDStream.trackStateByKey[StateType, EmittedDataType](spec) | ||
| * val mapWithStateDStream = keyValueDStream.mapWithState[StateType, MappedType](spec) | ||
| * }}} | ||
| * | ||
| * Example in Java: | ||
| * {{{ | ||
| * StateSpec<KeyType, ValueType, StateType, EmittedDataType> spec = | ||
| * StateSpec.<KeyType, ValueType, StateType, EmittedDataType>function(trackingFunction) | ||
| * StateSpec<KeyType, ValueType, StateType, MappedType> spec = | ||
| * StateSpec.<KeyType, ValueType, StateType, MappedType>function(mappingFunction) | ||
| * .numPartition(10); | ||
| * | ||
| * JavaTrackStateDStream<KeyType, ValueType, StateType, EmittedType> emittedRecordDStream = | ||
| * javaPairDStream.<StateType, EmittedDataType>trackStateByKey(spec); | ||
| * JavaMapWithStateDStream<KeyType, ValueType, StateType, MappedType> mapWithStateDStream = | ||
| * javaPairDStream.<StateType, MappedType>mapWithState(spec); | ||
| * }}} | ||
| * | ||
| * @tparam KeyType Class of the state key | ||
| * @tparam ValueType Class of the state value | ||
| * @tparam StateType Class of the state data | ||
| * @tparam MappedType Class of the mapped elements | ||
| */ | ||
| @Experimental | ||
| sealed abstract class StateSpec[KeyType, ValueType, StateType, EmittedType] extends Serializable { | ||
| sealed abstract class StateSpec[KeyType, ValueType, StateType, MappedType] extends Serializable { | ||
|
|
||
| /** Set the RDD containing the initial states that will be used by `trackStateByKey` */ | ||
| /** Set the RDD containing the initial states that will be used by `mapWithState` */ | ||
| def initialState(rdd: RDD[(KeyType, StateType)]): this.type | ||
|
|
||
| /** Set the RDD containing the initial states that will be used by `trackStateByKey` */ | ||
| /** Set the RDD containing the initial states that will be used by `mapWithState` */ | ||
| def initialState(javaPairRDD: JavaPairRDD[KeyType, StateType]): this.type | ||
|
|
||
| /** | ||
| * Set the number of partitions by which the state RDDs generated by `trackStateByKey` | ||
| * Set the number of partitions by which the state RDDs generated by `mapWithState` | ||
| * will be partitioned. Hash partitioning will be used. | ||
| */ | ||
| def numPartitions(numPartitions: Int): this.type | ||
|
|
||
| /** | ||
| * Set the partitioner by which the state RDDs generated by `trackStateByKey` will be | ||
| * Set the partitioner by which the state RDDs generated by `mapWithState` will be | ||
| * be partitioned. | ||
| */ | ||
| def partitioner(partitioner: Partitioner): this.type | ||
|
|
@@ -91,113 +96,114 @@ sealed abstract class StateSpec[KeyType, ValueType, StateType, EmittedType] exte | |
| /** | ||
| * :: Experimental :: | ||
| * Builder object for creating instances of [[org.apache.spark.streaming.StateSpec StateSpec]] | ||
| * that is used for specifying the parameters of the DStream transformation `trackStateByKey` | ||
| * that is used for specifying the parameters of the DStream transformation `mapWithState` | ||
| * that is used for specifying the parameters of the DStream transformation | ||
| * `trackStateByKey` operation of a | ||
| * `mapWithState` operation of a | ||
| * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]] (Scala) or a | ||
| * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]] (Java). | ||
| * | ||
| * Example in Scala: | ||
| * {{{ | ||
| * def trackingFunction(data: Option[ValueType], wrappedState: State[StateType]): EmittedType = { | ||
| * def mappingFunction(data: Option[ValueType], wrappedState: State[StateType]): MappedType = { | ||
| * ... | ||
| * } | ||
| * | ||
| * val emittedRecordDStream = keyValueDStream.trackStateByKey[StateType, EmittedDataType]( | ||
| * StateSpec.function(trackingFunction).numPartitions(10)) | ||
| * val spec = StateSpec.function(mappingFunction).numPartitions(10) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I remember this line cannot be put here because the compiler cannot infer
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am modifying the signature of the function to have the key. See jira SPARK-12245. So then this should not be a problem. |
||
| * | ||
| * val mapWithStateDStream = keyValueDStream.mapWithState[StateType, MappedType](spec) | ||
| * }}} | ||
| * | ||
| * Example in Java: | ||
| * {{{ | ||
| * StateSpec<KeyType, ValueType, StateType, EmittedDataType> spec = | ||
| * StateSpec.<KeyType, ValueType, StateType, EmittedDataType>function(trackingFunction) | ||
| * StateSpec<KeyType, ValueType, StateType, MappedType> spec = | ||
| * StateSpec.<KeyType, ValueType, StateType, MappedType>function(mappingFunction) | ||
| * .numPartition(10); | ||
| * | ||
| * JavaTrackStateDStream<KeyType, ValueType, StateType, EmittedType> emittedRecordDStream = | ||
| * javaPairDStream.<StateType, EmittedDataType>trackStateByKey(spec); | ||
| * JavaMapWithStateDStream<KeyType, ValueType, StateType, MappedType> mapWithStateDStream = | ||
| * javaPairDStream.<StateType, MappedType>mapWithState(spec); | ||
| * }}} | ||
| */ | ||
| @Experimental | ||
| object StateSpec { | ||
| /** | ||
| * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all the specifications | ||
| * of the `trackStateByKey` operation on a | ||
| * of the `mapWithState` operation on a | ||
| * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]]. | ||
| * | ||
| * @param trackingFunction The function applied on every data item to manage the associated state | ||
| * and generate the emitted data | ||
| * @param mappingFunction The function applied on every data item to manage the associated state | ||
| * and generate the mapped data | ||
| * @tparam KeyType Class of the keys | ||
| * @tparam ValueType Class of the values | ||
| * @tparam StateType Class of the states data | ||
| * @tparam EmittedType Class of the emitted data | ||
| * @tparam MappedType Class of the mapped data | ||
| */ | ||
| def function[KeyType, ValueType, StateType, EmittedType]( | ||
| trackingFunction: (Time, KeyType, Option[ValueType], State[StateType]) => Option[EmittedType] | ||
| ): StateSpec[KeyType, ValueType, StateType, EmittedType] = { | ||
| ClosureCleaner.clean(trackingFunction, checkSerializable = true) | ||
| new StateSpecImpl(trackingFunction) | ||
| def function[KeyType, ValueType, StateType, MappedType]( | ||
| mappingFunction: (Time, KeyType, Option[ValueType], State[StateType]) => Option[MappedType] | ||
| ): StateSpec[KeyType, ValueType, StateType, MappedType] = { | ||
| ClosureCleaner.clean(mappingFunction, checkSerializable = true) | ||
| new StateSpecImpl(mappingFunction) | ||
| } | ||
|
|
||
| /** | ||
| * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all the specifications | ||
| * of the `trackStateByKey` operation on a | ||
| * of the `mapWithState` operation on a | ||
| * [[org.apache.spark.streaming.dstream.PairDStreamFunctions pair DStream]]. | ||
| * | ||
| * @param trackingFunction The function applied on every data item to manage the associated state | ||
| * and generate the emitted data | ||
| * @param mappingFunction The function applied on every data item to manage the associated state | ||
| * and generate the mapped data | ||
| * @tparam ValueType Class of the values | ||
| * @tparam StateType Class of the states data | ||
| * @tparam EmittedType Class of the emitted data | ||
| * @tparam MappedType Class of the mapped data | ||
| */ | ||
| def function[KeyType, ValueType, StateType, EmittedType]( | ||
| trackingFunction: (Option[ValueType], State[StateType]) => EmittedType | ||
| ): StateSpec[KeyType, ValueType, StateType, EmittedType] = { | ||
| ClosureCleaner.clean(trackingFunction, checkSerializable = true) | ||
| def function[KeyType, ValueType, StateType, MappedType]( | ||
| mappingFunction: (Option[ValueType], State[StateType]) => MappedType | ||
| ): StateSpec[KeyType, ValueType, StateType, MappedType] = { | ||
| ClosureCleaner.clean(mappingFunction, checkSerializable = true) | ||
| val wrappedFunction = | ||
| (time: Time, key: Any, value: Option[ValueType], state: State[StateType]) => { | ||
| Some(trackingFunction(value, state)) | ||
| Some(mappingFunction(value, state)) | ||
| } | ||
| new StateSpecImpl(wrappedFunction) | ||
| } | ||
|
|
||
| /** | ||
| * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all | ||
| * the specifications of the `trackStateByKey` operation on a | ||
| * the specifications of the `mapWithState` operation on a | ||
| * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]]. | ||
| * | ||
| * @param javaTrackingFunction The function applied on every data item to manage the associated | ||
| * state and generate the emitted data | ||
| * @param mappingFunction The function applied on every data item to manage the associated | ||
| * state and generate the mapped data | ||
| * @tparam KeyType Class of the keys | ||
| * @tparam ValueType Class of the values | ||
| * @tparam StateType Class of the states data | ||
| * @tparam EmittedType Class of the emitted data | ||
| * @tparam MappedType Class of the mapped data | ||
| */ | ||
| def function[KeyType, ValueType, StateType, EmittedType](javaTrackingFunction: | ||
| JFunction4[Time, KeyType, Optional[ValueType], State[StateType], Optional[EmittedType]]): | ||
| StateSpec[KeyType, ValueType, StateType, EmittedType] = { | ||
| def function[KeyType, ValueType, StateType, MappedType](mappingFunction: | ||
| JFunction4[Time, KeyType, Optional[ValueType], State[StateType], Optional[MappedType]]): | ||
| StateSpec[KeyType, ValueType, StateType, MappedType] = { | ||
| val trackingFunc = (time: Time, k: KeyType, v: Option[ValueType], s: State[StateType]) => { | ||
| val t = javaTrackingFunction.call(time, k, JavaUtils.optionToOptional(v), s) | ||
| val t = mappingFunction.call(time, k, JavaUtils.optionToOptional(v), s) | ||
| Option(t.orNull) | ||
| } | ||
| StateSpec.function(trackingFunc) | ||
| } | ||
|
|
||
| /** | ||
| * Create a [[org.apache.spark.streaming.StateSpec StateSpec]] for setting all the specifications | ||
| * of the `trackStateByKey` operation on a | ||
| * of the `mapWithState` operation on a | ||
| * [[org.apache.spark.streaming.api.java.JavaPairDStream JavaPairDStream]]. | ||
| * | ||
| * @param javaTrackingFunction The function applied on every data item to manage the associated | ||
| * state and generate the emitted data | ||
| * @param mappingFunction The function applied on every data item to manage the associated | ||
| * state and generate the mapped data | ||
| * @tparam ValueType Class of the values | ||
| * @tparam StateType Class of the states data | ||
| * @tparam EmittedType Class of the emitted data | ||
| * @tparam MappedType Class of the mapped data | ||
| */ | ||
| def function[KeyType, ValueType, StateType, EmittedType]( | ||
| javaTrackingFunction: JFunction2[Optional[ValueType], State[StateType], EmittedType]): | ||
| StateSpec[KeyType, ValueType, StateType, EmittedType] = { | ||
| def function[KeyType, ValueType, StateType, MappedType]( | ||
| mappingFunction: JFunction2[Optional[ValueType], State[StateType], MappedType]): | ||
| StateSpec[KeyType, ValueType, StateType, MappedType] = { | ||
| val trackingFunc = (v: Option[ValueType], s: State[StateType]) => { | ||
| javaTrackingFunction.call(Optional.fromNullable(v.get), s) | ||
| mappingFunction.call(Optional.fromNullable(v.get), s) | ||
| } | ||
| StateSpec.function(trackingFunc) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: the method name `testTrackStateByAPI` should be renamed to `testMapWithStateAPI`