
Commit a6c4970

Remove the sorting
1 parent 7ff1059 commit a6c4970

2 files changed: +8 -16 lines changed


docs/structured-streaming-kafka-integration.md

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@ title: Structured Streaming + Kafka Integration Guide (Kafka broker version 0.10
 ---
 
 Structured Streaming integration for Kafka 0.10 to poll data from Kafka. It provides simple parallelism,
-1:1 correspondence between Kafka partitions and Spark partitions.
+1:1 correspondence between Kafka partitions and Spark partitions. The source will cache the Kafka
+consumer in executors and try its best to schedule the same Kafka topic partition to the same executor.
 
 ### Linking
 For Scala/Java applications using SBT/Maven project definitions, link your application with the following artifact:
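
For context, the behavior the new sentence describes works roughly as follows: each executor keeps a long-lived consumer per topic partition and reuses it across batches. Below is a minimal sketch of the idea, assuming a simple synchronized map; the names (`ConsumerCache`, `getOrCreate`) are illustrative, not Spark's actual `CachedKafkaConsumer` internals.

```scala
import java.{util => ju}
import scala.collection.mutable
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition

// Illustrative sketch only. The point: a consumer created for a topic
// partition is kept and handed back on the next batch, so re-reading the
// same partition on the same executor skips reconnect and metadata fetch.
object ConsumerCache {
  private val cache =
    mutable.Map.empty[TopicPartition, KafkaConsumer[Array[Byte], Array[Byte]]]

  def getOrCreate(
      tp: TopicPartition,
      kafkaParams: ju.Map[String, Object]): KafkaConsumer[Array[Byte], Array[Byte]] =
    synchronized {
      cache.getOrElseUpdate(tp, {
        val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](kafkaParams)
        consumer.assign(ju.Arrays.asList(tp))
        consumer
      })
    }
}
```

Reuse only pays off if the same partition keeps landing on the same executor, which is what the scheduling change in the source file below arranges.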

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -170,18 +170,18 @@ private[kafka010] case class KafkaSource(
170170

171171
// Use the until partitions to calculate offset ranges to ignore partitions that have
172172
// been deleted
173-
val sortedTopicPartitions = untilPartitionOffsets.keySet.filter { tp =>
173+
val topicPartitions = untilPartitionOffsets.keySet.filter { tp =>
174174
// Ignore partitions that we don't know the from offsets.
175175
newPartitionOffsets.contains(tp) || fromPartitionOffsets.contains(tp)
176-
}.toSeq.sorted(topicPartitionOrdering)
177-
logDebug("Sorted topicPartitions: " + sortedTopicPartitions.mkString(", "))
176+
}.toSeq
177+
logDebug("TopicPartitions: " + topicPartitions.mkString(", "))
178178

179179
val sortedExecutors = getSortedExecutorList(sc)
180180
val numExecutors = sortedExecutors.length
181181
logDebug("Sorted executors: " + sortedExecutors.mkString(", "))
182182

183183
// Calculate offset ranges
184-
val offsetRanges = sortedTopicPartitions.map { tp =>
184+
val offsetRanges = topicPartitions.map { tp =>
185185
val fromOffset = fromPartitionOffsets.get(tp).getOrElse {
186186
newPartitionOffsets.getOrElse(tp, {
187187
// This should not happen since newPartitionOffsets contains all partitions not in
@@ -191,6 +191,8 @@ private[kafka010] case class KafkaSource(
       }
       val untilOffset = untilPartitionOffsets(tp)
       val preferredLoc = if (numExecutors > 0) {
+        // This allows cached KafkaConsumers in the executors to be re-used to read the same
+        // partition in every batch.
         Some(sortedExecutors(floorMod(tp.hashCode, numExecutors)))
       } else None
       KafkaSourceRDDOffsetRange(tp, fromOffset, untilOffset, preferredLoc)
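
This hunk also explains why the sort of topic partitions removed above is safe to drop: the executor is chosen by hashing the partition, not by its position in the list, so only the executor list needs a stable order. A self-contained sketch of the rule, with hypothetical names (`PreferredLocation`, `preferredExecutor`):

```scala
import org.apache.kafka.common.TopicPartition

object PreferredLocation {
  // Non-negative modulo, same as the helper kept in KafkaSource.
  private def floorMod(a: Long, b: Int): Int = ((a % b).toInt + b) % b

  // Given a consistently sorted executor list, the same partition hashes to
  // the same index in every batch (while the executor set is unchanged), so
  // the cached consumer on that executor can be reused.
  def preferredExecutor(tp: TopicPartition, sortedExecutors: Seq[String]): Option[String] = {
    val numExecutors = sortedExecutors.length
    if (numExecutors > 0) {
      Some(sortedExecutors(floorMod(tp.hashCode, numExecutors)))
    } else None
  }
}
```

If executors come and go, the assignments shift and the consumer caches are simply rebuilt on the new executors; the mapping is a best-effort preference, not a hard constraint.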
@@ -390,16 +392,5 @@ private[kafka010] object KafkaSource {
     if (a.host == b.host) { a.executorId > b.executorId } else { a.host > b.host }
   }
 
-  // Sort the partitions and current list of executors to consistently assign each partition
-  // to the executor. This allows cached KafkaConsumers in the executors to be re-used to
-  // read the same partition in every batch.
-  private val topicPartitionOrdering = new Ordering[TopicPartition] {
-    override def compare(l: TopicPartition, r: TopicPartition): Int = {
-      implicitly[Ordering[(String, Long)]].compare(
-        (l.topic, l.partition),
-        (r.topic, r.partition))
-    }
-  }
-
   private def floorMod(a: Long, b: Int): Int = ((a % b).toInt + b) % b
 }
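
One detail worth calling out: `tp.hashCode` can be negative, and `%` in Scala keeps the sign of the dividend, so a plain `hashCode % numExecutors` could produce an invalid index. `floorMod` shifts the result into `[0, b)`. A quick worked check:

```scala
def floorMod(a: Long, b: Int): Int = ((a % b).toInt + b) % b

// -7 % 3 == -1 in Scala (sign follows the dividend), which would be an
// out-of-bounds index; floorMod maps it back into [0, 3):
assert(floorMod(-7L, 3) == 2)  // ((-1) + 3) % 3 == 2
assert(floorMod(7L, 3) == 1)   // ((1) + 3) % 3 == 1
```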
