From ace18af891c9df175ec813138d78a59c628f5555 Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Mon, 29 Apr 2024 23:11:33 +0000 Subject: [PATCH 1/4] colexecerror: add benchmarks for CatchVectorizedRuntimeError Informs: #123235 Release note: None --- pkg/sql/colexecerror/BUILD.bazel | 15 ++- pkg/sql/colexecerror/error.go | 6 +- pkg/sql/colexecerror/error_test.go | 175 ++++++++++++++++++++++++++++- pkg/sql/colexecerror/main_test.go | 32 ++++++ pkg/sql/sem/builtins/builtins.go | 45 ++++++++ pkg/sql/sem/builtins/fixed_oids.go | 1 + pkg/testutils/lint/lint_test.go | 2 +- 7 files changed, 272 insertions(+), 4 deletions(-) create mode 100644 pkg/sql/colexecerror/main_test.go diff --git a/pkg/sql/colexecerror/BUILD.bazel b/pkg/sql/colexecerror/BUILD.bazel index e0ff03f78799..5fc8bb0cdbe9 100644 --- a/pkg/sql/colexecerror/BUILD.bazel +++ b/pkg/sql/colexecerror/BUILD.bazel @@ -15,11 +15,24 @@ go_library( go_test( name = "colexecerror_test", - srcs = ["error_test.go"], + srcs = [ + "error_test.go", + "main_test.go", + ], deps = [ ":colexecerror", + "//pkg/base", + "//pkg/security/securityassets", + "//pkg/security/securitytest", + "//pkg/server", + "//pkg/sql/pgwire/pgcode", + "//pkg/sql/pgwire/pgerror", + "//pkg/testutils/serverutils", + "//pkg/testutils/sqlutils", "//pkg/util/leaktest", "//pkg/util/log", + "//pkg/util/randutil", + "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//require", ], ) diff --git a/pkg/sql/colexecerror/error.go b/pkg/sql/colexecerror/error.go index 78dce002bb72..c780c854a6fa 100644 --- a/pkg/sql/colexecerror/error.go +++ b/pkg/sql/colexecerror/error.go @@ -106,6 +106,9 @@ const ( sqlColPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/col" sqlRowPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/row" sqlSemPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/sem" + // When running BenchmarkCatchVectorizedRuntimeError under bazel, the + // repository prefix is missing. + testSqlColPackagesPrefix = "pkg/sql/col" ) // shouldCatchPanic checks whether the panic that was emitted from @@ -135,7 +138,8 @@ func shouldCatchPanic(panicEmittedFrom string) bool { strings.HasPrefix(panicEmittedFrom, execinfraPackagePrefix) || strings.HasPrefix(panicEmittedFrom, sqlColPackagesPrefix) || strings.HasPrefix(panicEmittedFrom, sqlRowPackagesPrefix) || - strings.HasPrefix(panicEmittedFrom, sqlSemPackagesPrefix) + strings.HasPrefix(panicEmittedFrom, sqlSemPackagesPrefix) || + strings.HasPrefix(panicEmittedFrom, testSqlColPackagesPrefix) } // StorageError is an error that was created by a component below the sql diff --git a/pkg/sql/colexecerror/error_test.go b/pkg/sql/colexecerror/error_test.go index 79c07c587606..7b7957c73783 100644 --- a/pkg/sql/colexecerror/error_test.go +++ b/pkg/sql/colexecerror/error_test.go @@ -11,13 +11,22 @@ package colexecerror_test import ( - "errors" + "context" + gosql "database/sql" + "fmt" + "runtime" "strings" "testing" + "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/sql/colexecerror" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -74,3 +83,167 @@ func TestNonCatchablePanicIsNotCaught(t *testing.T) { })) }) } + +// BenchmarkCatchVectorizedRuntimeError measures the time for +// CatchVectorizedRuntimeError to catch and process an error. +func BenchmarkCatchVectorizedRuntimeError(b *testing.B) { + err := errors.New("oops") + storageErr := colexecerror.NewStorageError(err) + pgErr := pgerror.WithCandidateCode(err, pgcode.Warning) + + cases := []struct { + name string + thrower func() + }{ + { + "noError", + func() {}, + }, + { + "expected", + func() { + colexecerror.ExpectedError(err) + }, + }, + { + "storage", + func() { + colexecerror.InternalError(storageErr) + }, + }, + { + "contextCanceled", + func() { + colexecerror.InternalError(context.Canceled) + }, + }, + { + "internalWithCode", + func() { + colexecerror.InternalError(pgErr) + }, + }, + { + "internal", + func() { + colexecerror.InternalError(err) + }, + }, + { + "runtime", + func() { + arr := []int{0, 1, 2} + _ = arr[3] + }, + }, + } + + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + _ = colexecerror.CatchVectorizedRuntimeError(tc.thrower) + } + }) + }) + } +} + +// BenchmarkSQLCatchVectorizedRuntimeError measures the time for +// CatchVectorizedRuntimeError to catch and process an error with a deeper stack +// than in BenchmarkCatchVectorizedRuntimeError. +func BenchmarkSQLCatchVectorizedRuntimeError(b *testing.B) { + defer leaktest.AfterTest(b)() + defer log.Scope(b).Close(b) + + cases := []struct { + name string + builtin string + }{ + { + "noError", + "crdb_internal.void_func()", + }, + { + "expectedWithCode", + "crdb_internal.force_error('01000', 'oops')", + }, + { + "expectedAssertion", + "crdb_internal.force_assertion_error('oops')", + }, + { + "internalAssertion", + "crdb_internal.force_panic('oops', 'internalAssertion')", + }, + { + "internalIndexOutOfRange", + "crdb_internal.force_panic('oops', 'indexOutOfRange')", + }, + { + "internalDivideByZero", + "crdb_internal.force_panic('oops', 'divideByZero')", + }, + { + "contextCanceled", + "crdb_internal.force_panic('oops', 'contextCanceled')", + }, + } + + // We execute this SELECT statement with various error-producing + // builtins. Ordering the projection this way creates a moderately deep stack + // with several nested calls to CatchVectorizedRuntimeError. + sqlFmt := `SELECT count(%s) OVER (), + 0, + '', + 0.0, + NULL, + '2000-01-01 00:00:00'::timestamptz, + b'00000000', + i + 0, + i * 1.5, + i / 100 + FROM generate_series(0, 0) AS s(i) +` + + ctx := context.Background() + s := serverutils.StartServerOnly(b, base.TestServerArgs{SQLMemoryPoolSize: 10 << 30}) + defer s.Stopper().Stop(ctx) + + for _, parallelism := range []int{1, 20, 50} { + numConns := runtime.GOMAXPROCS(0) * parallelism + b.Run(fmt.Sprintf("conns=%d", numConns), func(b *testing.B) { + for _, tc := range cases { + stmt := fmt.Sprintf(sqlFmt, tc.builtin) + b.Run(tc.name, func(b *testing.B) { + // Create as many warm connections as we will need for the benchmark. + conns := make(chan *gosql.DB, numConns) + for i := 0; i < numConns; i++ { + conn := s.ApplicationLayer().SQLConn(b, serverutils.DBName("")) + // Make sure we're using local, vectorized execution. + sqlDB := sqlutils.MakeSQLRunner(conn) + sqlDB.Exec(b, "SET distsql = off") + sqlDB.Exec(b, "SET vectorize = on") + // Warm up the connection by executing the statement once. We should + // always go through the query plan cache after this. + _, _ = conn.Exec(stmt) + conns <- conn + } + b.SetParallelism(parallelism) + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + var conn *gosql.DB + select { + case conn = <-conns: + default: + b.Fatal("not enough warm connections") + } + for pb.Next() { + _, _ = conn.Exec(stmt) + } + }) + }) + } + }) + } +} diff --git a/pkg/sql/colexecerror/main_test.go b/pkg/sql/colexecerror/main_test.go new file mode 100644 index 000000000000..6295dc259e2f --- /dev/null +++ b/pkg/sql/colexecerror/main_test.go @@ -0,0 +1,32 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package colexecerror_test + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/security/securityassets" + "github.com/cockroachdb/cockroach/pkg/security/securitytest" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util/randutil" +) + +//go:generate ../../util/leaktest/add-leaktest.sh *_test.go + +func TestMain(m *testing.M) { + securityassets.SetLoader(securitytest.EmbeddedAssets) + randutil.SeedForTests() + serverutils.InitTestServerFactory(server.TestServerFactory) + + os.Exit(m.Run()) +} diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 34ad29e50e96..512ecff38fec 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -26,6 +26,7 @@ import ( "math/bits" "net" "regexp/syntax" + "strconv" "strings" "time" "unicode" @@ -5688,6 +5689,50 @@ SELECT Info: "This function is used only by CockroachDB's developers for testing purposes.", Volatility: volatility.Volatile, }, + tree.Overload{ + Types: tree.ParamTypes{{Name: "msg", Typ: types.String}, {Name: "mode", Typ: types.String}}, + ReturnType: tree.FixedReturnType(types.Int), + Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + // The user must have REPAIRCLUSTER to use this builtin. + if err := evalCtx.SessionAccessor.CheckPrivilege( + ctx, syntheticprivilege.GlobalPrivilegeObject, privilege.REPAIRCLUSTER, + ); err != nil { + return nil, err + } + + s, ok := tree.AsDString(args[0]) + if !ok { + return nil, errors.Newf("expected string value, got %T", args[0]) + } + msg := string(s) + mode, ok := tree.AsDString(args[1]) + if !ok { + return nil, errors.Newf("expected string value, got %T", args[1]) + } + switch string(mode) { + case "internalAssertion": + err := errors.AssertionFailedf("%s", msg) + // Panic instead of returning the error. The vectorized panic-catcher + // will catch the panic and convert it into an internal error. + colexecerror.InternalError(err) + case "indexOutOfRange": + msg += string(msg[math.MaxInt]) + case "divideByZero": + var foo []int + msg += strconv.Itoa(len(msg) / len(foo)) + case "contextCanceled": + panic(context.Canceled) + default: + return nil, errors.Newf( + "expected mode to be one of: internalAssertion, indexOutOfRange, divideByZero, contextCanceled", + ) + } + // This code is unreachable. + panic(msg) + }, + Info: "This function is used only by CockroachDB's developers for testing purposes.", + Volatility: volatility.Volatile, + }, ), "crdb_internal.force_log_fatal": makeBuiltin( diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index 88eaee930f88..1a0af5d0481e 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2573,6 +2573,7 @@ var builtinOidsArray = []string{ 2605: `merge_aggregated_stmt_metadata(arg1: jsonb) -> jsonb`, 2606: `crdb_internal.protect_mvcc_history(timestamp: decimal, expiration_window: interval, description: string) -> int`, 2607: `crdb_internal.extend_mvcc_history_protection(job_id: int) -> void`, + 2608: `crdb_internal.force_panic(msg: string, mode: string) -> int`, } var builtinOidsBySignature map[string]oid.Oid diff --git a/pkg/testutils/lint/lint_test.go b/pkg/testutils/lint/lint_test.go index f333640480a9..30a967c4351d 100644 --- a/pkg/testutils/lint/lint_test.go +++ b/pkg/testutils/lint/lint_test.go @@ -2062,7 +2062,7 @@ func TestLint(t *testing.T) { // engine, don't forget to "register" the newly added package in // sql/colexecerror/error.go file. "sql/col*", - ":!sql/colexecerror/error.go", + ":!sql/colexecerror/error*.go", // This exception is because execgen itself uses panics during code // generation - not at execution time. The (glob,exclude) directive // (see git help gitglossary) makes * behave like a normal, single dir From 81d1ae940fe47d31b1e5efbf3d1c813ab11b3a74 Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Mon, 29 Apr 2024 23:32:21 +0000 Subject: [PATCH 2/4] colexecerror: check for expected errors before examining stack Informs: #123235 Release note: None --- pkg/sql/colexecerror/error.go | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pkg/sql/colexecerror/error.go b/pkg/sql/colexecerror/error.go index c780c854a6fa..ebe37a01fec9 100644 --- a/pkg/sql/colexecerror/error.go +++ b/pkg/sql/colexecerror/error.go @@ -36,6 +36,21 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) { return } + // First check for error types that should only come from the vectorized + // engine. + if err, ok := panicObj.(error); ok { + // StorageError was caused by something below SQL, and represents an error + // that we'd simply like to propagate along. + var se *StorageError + // notInternalError is an error that will be returned to the client + // without a stacktrace, sentry report, or "internal error" designation. + var nie *notInternalError + if errors.As(err, &se) || errors.As(err, &nie) { + retErr = err + return + } + } + // Find where the panic came from and only proceed if it is related to the // vectorized engine. stackTrace := string(debug.Stack()) @@ -66,20 +81,11 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) { } retErr = err - if _, ok := panicObj.(*StorageError); ok { - // A StorageError was caused by something below SQL, and represents - // an error that we'd simply like to propagate along. - // Do nothing. - return - } - annotateErrorWithoutCode := true - var nie *notInternalError - if errors.Is(err, context.Canceled) || errors.As(err, &nie) { - // We don't want to annotate the context cancellation and - // notInternalError errors in case they don't have a valid PG code - // so that the sentry report is not sent (errors with failed - // assertions get sentry reports). + if errors.Is(err, context.Canceled) { + // We don't want to annotate the context cancellation errors in case they + // don't have a valid PG code so that the sentry report is not sent + // (errors with failed assertions get sentry reports). annotateErrorWithoutCode = false } if code := pgerror.GetPGCode(err); annotateErrorWithoutCode && code == pgcode.Uncategorized { From ba91895dd8da74e23c1ab47a1bb51de04e8afffd Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Wed, 1 May 2024 08:35:56 +0000 Subject: [PATCH 3/4] colexecerror: check for internal errors before examining stack Informs: #123235 Release note: None --- pkg/sql/colexecerror/error.go | 66 ++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/pkg/sql/colexecerror/error.go b/pkg/sql/colexecerror/error.go index ebe37a01fec9..349dec93c348 100644 --- a/pkg/sql/colexecerror/error.go +++ b/pkg/sql/colexecerror/error.go @@ -42,17 +42,42 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) { // StorageError was caused by something below SQL, and represents an error // that we'd simply like to propagate along. var se *StorageError - // notInternalError is an error that will be returned to the client - // without a stacktrace, sentry report, or "internal error" designation. + // notInternalError is an error from the vectorized engine that we'd + // simply like to propagate along. var nie *notInternalError - if errors.As(err, &se) || errors.As(err, &nie) { + // internalError is an error from the vectorized engine that might need to + // be returned to the client with a stacktrace, sentry report, and + // "internal error" designation. + var ie *internalError + passthrough := errors.As(err, &se) || errors.As(err, &nie) + if errors.As(err, &ie) { + // Unwrap so that internalError doesn't show up in sentry reports. + retErr = ie.Unwrap() + // If the internal error doesn't already have an error code, mark it as + // an assertion error so that we generate a sentry report. (We don't do + // this for StorageError, notInternalError, or context.Canceled to avoid + // creating unnecessary sentry reports.) + if !passthrough && !errors.Is(retErr, context.Canceled) { + if code := pgerror.GetPGCode(retErr); code == pgcode.Uncategorized { + retErr = errors.NewAssertionErrorWithWrappedErrf( + retErr, "unexpected error from the vectorized engine", + ) + } + } + return + } + if passthrough { retErr = err return } } - // Find where the panic came from and only proceed if it is related to the - // vectorized engine. + // For other types of errors, we need to check where the panic came from. We + // only want to recover from panics that originated within the vectorized + // engine. We treat a panic from lower in the stack as unrecoverable. + + // Find where the panic came from and only proceed if it + // is related to the vectorized engine. stackTrace := string(debug.Stack()) scanner := bufio.NewScanner(strings.NewReader(stackTrace)) panicLineFound := false @@ -193,16 +218,41 @@ func decodeNotInternalError( return newNotInternalError(cause) } +// internalError is an error that occurs because the vectorized engine is in an +// unexpected state. Usually it wraps an assertion error. +type internalError struct { + cause error +} + +func newInternalError(err error) *internalError { + return &internalError{cause: err} +} + +var ( + _ errors.Wrapper = &internalError{} +) + +func (e *internalError) Error() string { return e.cause.Error() } +func (e *internalError) Cause() error { return e.cause } +func (e *internalError) Unwrap() error { return e.Cause() } + +func decodeInternalError( + _ context.Context, cause error, _ string, _ []string, _ proto.Message, +) error { + return newInternalError(cause) +} + func init() { errors.RegisterWrapperDecoder(errors.GetTypeKey((*notInternalError)(nil)), decodeNotInternalError) + errors.RegisterWrapperDecoder(errors.GetTypeKey((*internalError)(nil)), decodeInternalError) } -// InternalError simply panics with the provided object. It will always be -// caught and returned as internal error to the client with the corresponding +// InternalError panics with the error wrapped by internalError. It will always +// be caught and returned as internal error to the client with the corresponding // stack trace. This method should be called to propagate errors that resulted // in the vectorized engine being in an *unexpected* state. func InternalError(err error) { - panic(err) + panic(newInternalError(err)) } // ExpectedError panics with the error that is wrapped by From 879751eb8f367f1dc8be9f97c62272c3c5bb8328 Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Wed, 1 May 2024 08:36:57 +0000 Subject: [PATCH 4/4] colexecerror: use runtime.Callers instead of debug.Stack Fixes: #123235 Release note (performance improvement): Make error handling in the vectorized execution engine much cheaper. This should help avoid bad metastable regimes perpetuated by statement timeout handling consuming all CPU time, leading to more statement timeouts. --- pkg/sql/colexecerror/error.go | 56 +++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/pkg/sql/colexecerror/error.go b/pkg/sql/colexecerror/error.go index 349dec93c348..2f977523d870 100644 --- a/pkg/sql/colexecerror/error.go +++ b/pkg/sql/colexecerror/error.go @@ -11,9 +11,8 @@ package colexecerror import ( - "bufio" "context" - "runtime/debug" + "runtime" "strings" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" @@ -72,29 +71,42 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) { } } - // For other types of errors, we need to check where the panic came from. We - // only want to recover from panics that originated within the vectorized - // engine. We treat a panic from lower in the stack as unrecoverable. - - // Find where the panic came from and only proceed if it - // is related to the vectorized engine. - stackTrace := string(debug.Stack()) - scanner := bufio.NewScanner(strings.NewReader(stackTrace)) - panicLineFound := false - for scanner.Scan() { - if strings.Contains(scanner.Text(), panicLineSubstring) { - panicLineFound = true - break + // For other types of errors, we need to check whence the panic originated + // to know what to do. If the panic originated in the vectorized engine, we + // can safely return it as a normal error knowing that any illegal state + // will be cleaned up when the statement finishes. If the panic originated + // lower in the stack, however, we must treat it as unrecoverable because it + // could indicate an illegal state that might persist even after this + // statement finishes. + // + // To check whence the panic originated, we find the frame just before the + // panic frame. + var panicLineFound bool + var panicEmittedFrom string + // We should be able to find it within 3 program counters, starting with the + // caller of this deferred function (2 above the runtime.Callers frame). + pc := make([]uintptr, 3) + n := runtime.Callers(2, pc) + if n >= 1 { + frames := runtime.CallersFrames(pc[:n]) + // A fixed number of program counters can expand to any number of frames. + for { + frame, more := frames.Next() + if strings.Contains(frame.File, panicLineSubstring) { + panicLineFound = true + } else if panicLineFound { + panicEmittedFrom = frame.Function + break + } + if !more { + break + } } } - if !panicLineFound { - panic(errors.AssertionFailedf("panic line %q not found in the stack trace\n%s", panicLineSubstring, stackTrace)) - } - if !scanner.Scan() { - panic(errors.AssertionFailedf("unexpectedly there is no line below the panic line in the stack trace\n%s", stackTrace)) - } - panicEmittedFrom := strings.TrimSpace(scanner.Text()) if !shouldCatchPanic(panicEmittedFrom) { + // The panic is from outside the vectorized engine (or we didn't find it + // in the stack). We treat it as unrecoverable because it could indicate + // an illegal state that might persist even after this statement finishes. panic(panicObj) }