-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-35897][SS] Support user defined initial state with flatMapGroupsWithState in Structured Streaming #33093
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
6c1443c
ec2b972
f221358
673008d
a25ea47
ef50c79
7629d2e
e2234a8
7d7b5a9
57c9c2f
4e85a62
0374bdb
b8c70ab
b47ac23
eb83b68
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,12 @@ object UnsupportedOperationChecker extends Logging { | |
| case p if p.isStreaming => | ||
| throwError("Queries with streaming sources must be executed with writeStream.start()")(p) | ||
|
|
||
| case f: FlatMapGroupsWithState => | ||
| if (f.hasInitialState) { | ||
| throwError("Batch [flatMap|map]GroupsWithState queries should not" + | ||
| " pass an initial state.")(f) | ||
| } | ||
|
|
||
| case _ => | ||
| } | ||
| } | ||
|
|
@@ -232,6 +238,10 @@ object UnsupportedOperationChecker extends Logging { | |
| // Check compatibility with output modes and aggregations in query | ||
| val aggsInQuery = collectStreamingAggregates(plan) | ||
|
|
||
| if (m.initialState.isStreaming) { | ||
| // initial state has to be a batch relation | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Non-streaming DataFrame/Dataset is not supported as the initial state in [flatMap|map]GroupsWithState operation on a streamiing DataFrame/Dataset
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| throwError("Initial state cannot be a streaming DataFrame/Dataset.") | ||
| } | ||
| if (m.isMapGroupsWithState) { // check mapGroupsWithState | ||
| // allowed only in update query output mode and without aggregation | ||
| if (aggsInQuery.nonEmpty) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -440,7 +440,7 @@ object FlatMapGroupsWithState { | |
| isMapGroupsWithState: Boolean, | ||
| timeout: GroupStateTimeout, | ||
| child: LogicalPlan): LogicalPlan = { | ||
| val encoder = encoderFor[S] | ||
| val stateEncoder = encoderFor[S] | ||
|
|
||
| val mapped = new FlatMapGroupsWithState( | ||
| func, | ||
|
|
@@ -449,10 +449,49 @@ object FlatMapGroupsWithState { | |
| groupingAttributes, | ||
| dataAttributes, | ||
| CatalystSerde.generateObjAttr[U], | ||
| encoder.asInstanceOf[ExpressionEncoder[Any]], | ||
| stateEncoder.asInstanceOf[ExpressionEncoder[Any]], | ||
| outputMode, | ||
| isMapGroupsWithState, | ||
| timeout, | ||
| hasInitialState = false, | ||
| groupingAttributes, | ||
| dataAttributes, | ||
| UnresolvedDeserializer(encoderFor[K].deserializer, groupingAttributes), | ||
| LocalRelation(stateEncoder.schema.toAttributes), // empty data set | ||
| child | ||
| ) | ||
| CatalystSerde.serialize[U](mapped) | ||
| } | ||
|
|
||
| def apply[K: Encoder, V: Encoder, S: Encoder, U: Encoder]( | ||
| func: (Any, Iterator[Any], LogicalGroupState[Any]) => Iterator[Any], | ||
| groupingAttributes: Seq[Attribute], | ||
| dataAttributes: Seq[Attribute], | ||
| outputMode: OutputMode, | ||
| isMapGroupsWithState: Boolean, | ||
| timeout: GroupStateTimeout, | ||
| child: LogicalPlan, | ||
| initialStateGroupAttrs: Seq[Attribute], | ||
| initialStateDataAttrs: Seq[Attribute], | ||
| initialState: LogicalPlan): LogicalPlan = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: init and initial. be consistent
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| val stateEncoder = encoderFor[S] | ||
|
|
||
| val mapped = new FlatMapGroupsWithState( | ||
| func, | ||
| UnresolvedDeserializer(encoderFor[K].deserializer, groupingAttributes), | ||
| UnresolvedDeserializer(encoderFor[V].deserializer, dataAttributes), | ||
| groupingAttributes, | ||
| dataAttributes, | ||
| CatalystSerde.generateObjAttr[U], | ||
| stateEncoder.asInstanceOf[ExpressionEncoder[Any]], | ||
| outputMode, | ||
| isMapGroupsWithState, | ||
| timeout, | ||
| hasInitialState = true, | ||
| initialStateGroupAttrs, | ||
| initialStateDataAttrs, | ||
| UnresolvedDeserializer(encoderFor[S].deserializer, initialStateDataAttrs), | ||
| initialState, | ||
| child) | ||
| CatalystSerde.serialize[U](mapped) | ||
| } | ||
|
|
@@ -474,6 +513,12 @@ object FlatMapGroupsWithState { | |
| * @param outputMode the output mode of `func` | ||
| * @param isMapGroupsWithState whether it is created by the `mapGroupsWithState` method | ||
| * @param timeout used to timeout groups that have not received data in a while | ||
| * @param hasInitialState Indicates whether initial state needs to be applied or not. | ||
| * @param initialStateGroupAttrs grouping attributes for the initial state | ||
| * @param initialStateDataAttrs used to read the initial state | ||
| * @param initialStateDeserializer used to extract the initial state objects. | ||
| * @param initialState user defined initial state that is applied in the first batch. | ||
| * @param child logical plan of the underlying data | ||
| */ | ||
| case class FlatMapGroupsWithState( | ||
| func: (Any, Iterator[Any], LogicalGroupState[Any]) => Iterator[Any], | ||
|
|
@@ -486,14 +531,24 @@ case class FlatMapGroupsWithState( | |
| outputMode: OutputMode, | ||
| isMapGroupsWithState: Boolean = false, | ||
| timeout: GroupStateTimeout, | ||
| child: LogicalPlan) extends UnaryNode with ObjectProducer { | ||
| hasInitialState: Boolean = false, | ||
| initialStateGroupAttrs: Seq[Attribute] = Seq.empty, | ||
|
||
| initialStateDataAttrs: Seq[Attribute] = Seq.empty, | ||
| initialStateDeserializer: Expression, | ||
| initialState: LogicalPlan, | ||
| child: LogicalPlan) extends BinaryNode with ObjectProducer { | ||
|
|
||
| if (isMapGroupsWithState) { | ||
| assert(outputMode == OutputMode.Update) | ||
| } | ||
|
|
||
| override protected def withNewChildInternal(newChild: LogicalPlan): FlatMapGroupsWithState = | ||
| copy(child = newChild) | ||
| override def left: LogicalPlan = child | ||
|
|
||
| override def right: LogicalPlan = initialState | ||
|
|
||
| override protected def withNewChildrenInternal( | ||
| newLeft: LogicalPlan, newRight: LogicalPlan): FlatMapGroupsWithState = | ||
| copy(child = newLeft, initialState = newRight) | ||
| } | ||
|
|
||
| /** Factory for constructing new `FlatMapGroupsInR` nodes. */ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Initial state is not supported in [flatMap|map]GroupsWithState operation on a batch DataFrame/Dataset