
Commit 6d68a68

kvstreamer: fix pathological behavior in InOrder mode
This commit fixes a case of pathological behavior by the streamer in the InOrder mode. When ordering needs to be maintained, the streamer must prioritize sub-requests that have higher "urgency" to be served (i.e. those that are closer to the head of the line). This "urgency" is represented by the values in the `singleRangeBatch.positions` slice, where a smaller value means higher urgency, and the value at the zeroth index is used as the priority for the whole single-range batch. The values in this slice are assumed to be increasing, but that assumption could previously be violated when multiple ranges were touched (when the original batch fits within a single range, we have a separate fast path that is unaffected by this bug). This happened because we used `mustPreserveOrder = false` when instantiating the batch truncation helper. As a result, all sub-requests within the single-range batch would get reordered according to the start key of each request, and the original order wouldn't be restored by the batch truncation helper. This, in turn, would result in the streamer evaluating the requests with effectively random urgency, which would then consume the working budget. In the extreme, we would use up all available budget on random requests, buffer them, and keep doing so until we got lucky and randomly picked up the next head-of-the-line request.

This is now fixed by having the truncation helper restore the order of `positions` when the streamer is in the InOrder mode. This commit also adds a test-only assertion that the ascending invariant is maintained.

Here is a concrete example of the behavior. Say we have two ranges [a - f) and [f - ...) and requests

  0: Get(c)
  1: Get(e)
  2: Get(d)
  3: Get(f)
  4: Get(a)
  5: Get(b)

The batch truncation helper will first order all requests by their start key, so it'll process them in the order 4 - 5 - 0 - 2 - 1 - 3. When truncating to the first range [a - f), it'll populate `positions` as `[4, 5, 0, 2, 1]` (request 3 is outside of the range, so it'll stop). This slice is what we would previously include in `singleRangeBatch.positions`, so we would first evaluate the 4th request, then the 5th, etc. Previously, we would also incorrectly compare `singleRangeBatch`es against each other for "in order" priority.

AFAICT this bug has been present since the introduction of the batch truncation helper in 645c154. The assumption of the InOrder mode was already there, in a comment, but wasn't enforced and was overlooked.

Release note (bug fix): Previously, when executing queries with index / lookup joins when the ordering needs to be maintained, CockroachDB in some cases could exhibit pathological behavior leading to increased query latency, possibly by several orders of magnitude. This bug was introduced in 22.2 and is now fixed.
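To make the ordering issue above easier to see, here is a minimal, self-contained Go sketch of the example (it is not the actual `BatchTruncationHelper`; the `req` type and the sort/truncate steps are simplified stand-ins). It shows how sorting by start key scrambles `positions` for the first range, and how restoring the original ascending order puts the head-of-the-line request back at index zero, which is what the InOrder priority relies on.

package main

import (
	"fmt"
	"sort"
)

// req is a simplified stand-in for a point Get with its original enqueue position.
type req struct {
	key      string
	position int
}

func main() {
	// Requests in their original order, as in the commit message example:
	// 0: Get(c), 1: Get(e), 2: Get(d), 3: Get(f), 4: Get(a), 5: Get(b).
	reqs := []req{
		{"c", 0}, {"e", 1}, {"d", 2}, {"f", 3}, {"a", 4}, {"b", 5},
	}

	// The truncation helper first sorts all requests by their start key so
	// that it can truncate them to range boundaries in a single pass.
	sort.Slice(reqs, func(i, j int) bool { return reqs[i].key < reqs[j].key })

	// Truncate to the first range [a, f): keep everything with key < "f".
	var positions []int
	for _, r := range reqs {
		if r.key >= "f" {
			break
		}
		positions = append(positions, r.position)
	}
	// Without restoring the order, positions[0] == 4, so the batch's priority
	// (taken from the zeroth element) no longer corresponds to the true
	// head-of-the-line request.
	fmt.Println("scrambled positions:", positions) // [4 5 0 2 1]

	// With mustPreserveOrder (the fix for the InOrder mode), the helper
	// restores the original, ascending order (and reorders the requests
	// accordingly), so positions[0] == 0 again.
	sort.Ints(positions)
	fmt.Println("restored positions: ", positions) // [0 1 2 4 5]
}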
1 parent: 19f8f61

File tree: 2 files changed (+80, −3)

pkg/kv/kvclient/kvstreamer/streamer.go

Lines changed: 22 additions & 3 deletions

@@ -562,9 +562,11 @@ func (s *Streamer) Enqueue(ctx context.Context, reqs []kvpb.RequestUnion) (retEr
 		// ranges.
 		if s.truncationHelper == nil {
 			// The streamer can process the responses in an arbitrary order, so
-			// we don't require the helper to preserve the order of requests and
-			// allow it to reorder the reqs slice too.
-			const mustPreserveOrder = false
+			// we don't require the helper to preserve the order of requests,
+			// unless we're in the InOrder mode when we must maintain increasing
+			// positions. We unconditionally allow reordering of the reqs slice
+			// though.
+			var mustPreserveOrder = s.mode == InOrder
 			const canReorderRequestsSlice = true
 			s.truncationHelper, err = kvcoord.NewBatchTruncationHelper(
 				scanDir, reqs, mustPreserveOrder, canReorderRequestsSlice,
@@ -1343,6 +1345,20 @@ func (w *workerCoordinator) performRequestAsync(
 		ba.AdmissionHeader.NoMemoryReservedAtSource = false
 		ba.Requests = req.reqs
 
+		if buildutil.CrdbTestBuild {
+			if w.s.mode == InOrder {
+				for i := range req.positions[:len(req.positions)-1] {
+					if req.positions[i] >= req.positions[i+1] {
+						w.s.results.setError(errors.AssertionFailedf(
+							"positions aren't ascending: %d before %d at index %d",
+							req.positions[i], req.positions[i+1], i,
+						))
+						return
+					}
+				}
+			}
+		}
+
 		// TODO(yuzefovich): in Enqueue we split all requests into
 		// single-range batches, so ideally ba touches a single range in
 		// which case we hit the fast path in the DistSender. However, if
@@ -1770,6 +1786,9 @@ func buildResumeSingleRangeBatch(
 	// We've already reconciled the budget with the actual reservation for the
 	// requests with the ResumeSpans.
 	resumeReq.reqsReservedBytes = fp.resumeReqsMemUsage
+	// TODO(yuzefovich): add heuristic for making fresh allocation of slices
+	// whenever only a fraction of them will be used by the resume batch. This
+	// will allow us to return most of overheadAccountedFor to the budget.
 	resumeReq.overheadAccountedFor = req.overheadAccountedFor
 	// Note that due to limitations of the KV layer (#75452) we cannot reuse
 	// original requests because the KV doesn't allow mutability (and all

pkg/kv/kvclient/kvstreamer/streamer_test.go

Lines changed: 58 additions & 0 deletions

@@ -690,3 +690,61 @@ ALTER TABLE t SPLIT AT SELECT generate_series(1, 30000, 3000);
 		}
 	}
 }
+
+// TestStreamerRandomAccess verifies that the Streamer handles the requests that
+// have random access pattern within ranges reasonably well. It is a regression
+// test for #133043.
+func TestStreamerRandomAccess(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	skip.UnderStress(t)
+	skip.UnderRace(t)
+
+	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{})
+	defer s.Stopper().Stop(context.Background())
+
+	rng, _ := randutil.NewTestRand()
+	runner := sqlutils.MakeSQLRunner(db)
+	// Create a table with 3 ranges, with 2k rows in each. Each row is about
+	// 2.7KiB in size and has a random value in column 'v'.
+	runner.Exec(t, `
+CREATE TABLE t (
+	k INT PRIMARY KEY,
+	v INT,
+	blob STRING,
+	INDEX v_idx (v)
+);
+
+INSERT INTO t (k, v, blob) SELECT i, (random()*6000)::INT, repeat('a', 2700) FROM generate_series(1, 6000) AS g(i);
+
+ALTER TABLE t SPLIT AT SELECT i*2000 FROM generate_series(0, 2) AS g(i);
+`)
+
+	// The meat of the test - run the query that performs an index join to fetch
+	// all rows via the streamer, both in the OutOfOrder and InOrder modes, and
+	// with different workmem limits. Each time assert that the number of
+	// BatchRequests issued is relatively small (if not, then the streamer was
+	// extremely suboptimal).
+	kvGRPCCallsRegex := regexp.MustCompile(`KV gRPC calls: ([\d,]+)`)
+	for i := 0; i < 10; i++ {
+		// Pick random workmem limit in [2MiB; 16MiB] range.
+		workmem := 2<<20 + rng.Intn(14<<20)
+		runner.Exec(t, fmt.Sprintf("SET distsql_workmem = '%dB'", workmem))
+		for inOrder := range []bool{false, true} {
+			runner.Exec(t, `SET streamer_always_maintain_ordering = $1;`, inOrder)
+			gRPCCalls := -1
+			var err error
+			rows := runner.QueryStr(t, `EXPLAIN ANALYZE SELECT * FROM t@v_idx WHERE v > 0`)
+			for _, row := range rows {
+				if matches := kvGRPCCallsRegex.FindStringSubmatch(row[0]); len(matches) > 0 {
+					gRPCCalls, err = strconv.Atoi(strings.ReplaceAll(matches[1], ",", ""))
+					require.NoError(t, err)
+					break
+				}
+			}
+			require.Greater(t, gRPCCalls, 0, rows)
+			require.Greater(t, 150, gRPCCalls, rows)
+		}
+	}
+}
