diff --git a/pkg/sql/colexecerror/BUILD.bazel b/pkg/sql/colexecerror/BUILD.bazel index e0ff03f78799..5fc8bb0cdbe9 100644 --- a/pkg/sql/colexecerror/BUILD.bazel +++ b/pkg/sql/colexecerror/BUILD.bazel @@ -15,11 +15,24 @@ go_library( go_test( name = "colexecerror_test", - srcs = ["error_test.go"], + srcs = [ + "error_test.go", + "main_test.go", + ], deps = [ ":colexecerror", + "//pkg/base", + "//pkg/security/securityassets", + "//pkg/security/securitytest", + "//pkg/server", + "//pkg/sql/pgwire/pgcode", + "//pkg/sql/pgwire/pgerror", + "//pkg/testutils/serverutils", + "//pkg/testutils/sqlutils", "//pkg/util/leaktest", "//pkg/util/log", + "//pkg/util/randutil", + "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//require", ], ) diff --git a/pkg/sql/colexecerror/error.go b/pkg/sql/colexecerror/error.go index 78dce002bb72..2f977523d870 100644 --- a/pkg/sql/colexecerror/error.go +++ b/pkg/sql/colexecerror/error.go @@ -11,9 +11,8 @@ package colexecerror import ( - "bufio" "context" - "runtime/debug" + "runtime" "strings" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" @@ -36,25 +35,78 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) { return } - // Find where the panic came from and only proceed if it is related to the - // vectorized engine. - stackTrace := string(debug.Stack()) - scanner := bufio.NewScanner(strings.NewReader(stackTrace)) - panicLineFound := false - for scanner.Scan() { - if strings.Contains(scanner.Text(), panicLineSubstring) { - panicLineFound = true - break + // First check for error types that should only come from the vectorized + // engine. + if err, ok := panicObj.(error); ok { + // StorageError was caused by something below SQL, and represents an error + // that we'd simply like to propagate along. + var se *StorageError + // notInternalError is an error from the vectorized engine that we'd + // simply like to propagate along. + var nie *notInternalError + // internalError is an error from the vectorized engine that might need to + // be returned to the client with a stacktrace, sentry report, and + // "internal error" designation. + var ie *internalError + passthrough := errors.As(err, &se) || errors.As(err, &nie) + if errors.As(err, &ie) { + // Unwrap so that internalError doesn't show up in sentry reports. + retErr = ie.Unwrap() + // If the internal error doesn't already have an error code, mark it as + // an assertion error so that we generate a sentry report. (We don't do + // this for StorageError, notInternalError, or context.Canceled to avoid + // creating unnecessary sentry reports.) + if !passthrough && !errors.Is(retErr, context.Canceled) { + if code := pgerror.GetPGCode(retErr); code == pgcode.Uncategorized { + retErr = errors.NewAssertionErrorWithWrappedErrf( + retErr, "unexpected error from the vectorized engine", + ) + } + } + return + } + if passthrough { + retErr = err + return } } - if !panicLineFound { - panic(errors.AssertionFailedf("panic line %q not found in the stack trace\n%s", panicLineSubstring, stackTrace)) - } - if !scanner.Scan() { - panic(errors.AssertionFailedf("unexpectedly there is no line below the panic line in the stack trace\n%s", stackTrace)) + + // For other types of errors, we need to check whence the panic originated + // to know what to do. If the panic originated in the vectorized engine, we + // can safely return it as a normal error knowing that any illegal state + // will be cleaned up when the statement finishes. If the panic originated + // lower in the stack, however, we must treat it as unrecoverable because it + // could indicate an illegal state that might persist even after this + // statement finishes. + // + // To check whence the panic originated, we find the frame just before the + // panic frame. + var panicLineFound bool + var panicEmittedFrom string + // We should be able to find it within 3 program counters, starting with the + // caller of this deferred function (2 above the runtime.Callers frame). + pc := make([]uintptr, 3) + n := runtime.Callers(2, pc) + if n >= 1 { + frames := runtime.CallersFrames(pc[:n]) + // A fixed number of program counters can expand to any number of frames. + for { + frame, more := frames.Next() + if strings.Contains(frame.File, panicLineSubstring) { + panicLineFound = true + } else if panicLineFound { + panicEmittedFrom = frame.Function + break + } + if !more { + break + } + } } - panicEmittedFrom := strings.TrimSpace(scanner.Text()) if !shouldCatchPanic(panicEmittedFrom) { + // The panic is from outside the vectorized engine (or we didn't find it + // in the stack). We treat it as unrecoverable because it could indicate + // an illegal state that might persist even after this statement finishes. panic(panicObj) } @@ -66,20 +118,11 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) { } retErr = err - if _, ok := panicObj.(*StorageError); ok { - // A StorageError was caused by something below SQL, and represents - // an error that we'd simply like to propagate along. - // Do nothing. - return - } - annotateErrorWithoutCode := true - var nie *notInternalError - if errors.Is(err, context.Canceled) || errors.As(err, &nie) { - // We don't want to annotate the context cancellation and - // notInternalError errors in case they don't have a valid PG code - // so that the sentry report is not sent (errors with failed - // assertions get sentry reports). + if errors.Is(err, context.Canceled) { + // We don't want to annotate the context cancellation errors in case they + // don't have a valid PG code so that the sentry report is not sent + // (errors with failed assertions get sentry reports). annotateErrorWithoutCode = false } if code := pgerror.GetPGCode(err); annotateErrorWithoutCode && code == pgcode.Uncategorized { @@ -106,6 +149,9 @@ const ( sqlColPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/col" sqlRowPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/row" sqlSemPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/sem" + // When running BenchmarkCatchVectorizedRuntimeError under bazel, the + // repository prefix is missing. + testSqlColPackagesPrefix = "pkg/sql/col" ) // shouldCatchPanic checks whether the panic that was emitted from @@ -135,7 +181,8 @@ func shouldCatchPanic(panicEmittedFrom string) bool { strings.HasPrefix(panicEmittedFrom, execinfraPackagePrefix) || strings.HasPrefix(panicEmittedFrom, sqlColPackagesPrefix) || strings.HasPrefix(panicEmittedFrom, sqlRowPackagesPrefix) || - strings.HasPrefix(panicEmittedFrom, sqlSemPackagesPrefix) + strings.HasPrefix(panicEmittedFrom, sqlSemPackagesPrefix) || + strings.HasPrefix(panicEmittedFrom, testSqlColPackagesPrefix) } // StorageError is an error that was created by a component below the sql @@ -183,16 +230,41 @@ func decodeNotInternalError( return newNotInternalError(cause) } +// internalError is an error that occurs because the vectorized engine is in an +// unexpected state. Usually it wraps an assertion error. +type internalError struct { + cause error +} + +func newInternalError(err error) *internalError { + return &internalError{cause: err} +} + +var ( + _ errors.Wrapper = &internalError{} +) + +func (e *internalError) Error() string { return e.cause.Error() } +func (e *internalError) Cause() error { return e.cause } +func (e *internalError) Unwrap() error { return e.Cause() } + +func decodeInternalError( + _ context.Context, cause error, _ string, _ []string, _ proto.Message, +) error { + return newInternalError(cause) +} + func init() { errors.RegisterWrapperDecoder(errors.GetTypeKey((*notInternalError)(nil)), decodeNotInternalError) + errors.RegisterWrapperDecoder(errors.GetTypeKey((*internalError)(nil)), decodeInternalError) } -// InternalError simply panics with the provided object. It will always be -// caught and returned as internal error to the client with the corresponding +// InternalError panics with the error wrapped by internalError. It will always +// be caught and returned as internal error to the client with the corresponding // stack trace. This method should be called to propagate errors that resulted // in the vectorized engine being in an *unexpected* state. func InternalError(err error) { - panic(err) + panic(newInternalError(err)) } // ExpectedError panics with the error that is wrapped by diff --git a/pkg/sql/colexecerror/error_test.go b/pkg/sql/colexecerror/error_test.go index 79c07c587606..7b7957c73783 100644 --- a/pkg/sql/colexecerror/error_test.go +++ b/pkg/sql/colexecerror/error_test.go @@ -11,13 +11,22 @@ package colexecerror_test import ( - "errors" + "context" + gosql "database/sql" + "fmt" + "runtime" "strings" "testing" + "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/sql/colexecerror" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -74,3 +83,167 @@ func TestNonCatchablePanicIsNotCaught(t *testing.T) { })) }) } + +// BenchmarkCatchVectorizedRuntimeError measures the time for +// CatchVectorizedRuntimeError to catch and process an error. +func BenchmarkCatchVectorizedRuntimeError(b *testing.B) { + err := errors.New("oops") + storageErr := colexecerror.NewStorageError(err) + pgErr := pgerror.WithCandidateCode(err, pgcode.Warning) + + cases := []struct { + name string + thrower func() + }{ + { + "noError", + func() {}, + }, + { + "expected", + func() { + colexecerror.ExpectedError(err) + }, + }, + { + "storage", + func() { + colexecerror.InternalError(storageErr) + }, + }, + { + "contextCanceled", + func() { + colexecerror.InternalError(context.Canceled) + }, + }, + { + "internalWithCode", + func() { + colexecerror.InternalError(pgErr) + }, + }, + { + "internal", + func() { + colexecerror.InternalError(err) + }, + }, + { + "runtime", + func() { + arr := []int{0, 1, 2} + _ = arr[3] + }, + }, + } + + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + _ = colexecerror.CatchVectorizedRuntimeError(tc.thrower) + } + }) + }) + } +} + +// BenchmarkSQLCatchVectorizedRuntimeError measures the time for +// CatchVectorizedRuntimeError to catch and process an error with a deeper stack +// than in BenchmarkCatchVectorizedRuntimeError. +func BenchmarkSQLCatchVectorizedRuntimeError(b *testing.B) { + defer leaktest.AfterTest(b)() + defer log.Scope(b).Close(b) + + cases := []struct { + name string + builtin string + }{ + { + "noError", + "crdb_internal.void_func()", + }, + { + "expectedWithCode", + "crdb_internal.force_error('01000', 'oops')", + }, + { + "expectedAssertion", + "crdb_internal.force_assertion_error('oops')", + }, + { + "internalAssertion", + "crdb_internal.force_panic('oops', 'internalAssertion')", + }, + { + "internalIndexOutOfRange", + "crdb_internal.force_panic('oops', 'indexOutOfRange')", + }, + { + "internalDivideByZero", + "crdb_internal.force_panic('oops', 'divideByZero')", + }, + { + "contextCanceled", + "crdb_internal.force_panic('oops', 'contextCanceled')", + }, + } + + // We execute this SELECT statement with various error-producing + // builtins. Ordering the projection this way creates a moderately deep stack + // with several nested calls to CatchVectorizedRuntimeError. + sqlFmt := `SELECT count(%s) OVER (), + 0, + '', + 0.0, + NULL, + '2000-01-01 00:00:00'::timestamptz, + b'00000000', + i + 0, + i * 1.5, + i / 100 + FROM generate_series(0, 0) AS s(i) +` + + ctx := context.Background() + s := serverutils.StartServerOnly(b, base.TestServerArgs{SQLMemoryPoolSize: 10 << 30}) + defer s.Stopper().Stop(ctx) + + for _, parallelism := range []int{1, 20, 50} { + numConns := runtime.GOMAXPROCS(0) * parallelism + b.Run(fmt.Sprintf("conns=%d", numConns), func(b *testing.B) { + for _, tc := range cases { + stmt := fmt.Sprintf(sqlFmt, tc.builtin) + b.Run(tc.name, func(b *testing.B) { + // Create as many warm connections as we will need for the benchmark. + conns := make(chan *gosql.DB, numConns) + for i := 0; i < numConns; i++ { + conn := s.ApplicationLayer().SQLConn(b, serverutils.DBName("")) + // Make sure we're using local, vectorized execution. + sqlDB := sqlutils.MakeSQLRunner(conn) + sqlDB.Exec(b, "SET distsql = off") + sqlDB.Exec(b, "SET vectorize = on") + // Warm up the connection by executing the statement once. We should + // always go through the query plan cache after this. + _, _ = conn.Exec(stmt) + conns <- conn + } + b.SetParallelism(parallelism) + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + var conn *gosql.DB + select { + case conn = <-conns: + default: + b.Fatal("not enough warm connections") + } + for pb.Next() { + _, _ = conn.Exec(stmt) + } + }) + }) + } + }) + } +} diff --git a/pkg/sql/colexecerror/main_test.go b/pkg/sql/colexecerror/main_test.go new file mode 100644 index 000000000000..6295dc259e2f --- /dev/null +++ b/pkg/sql/colexecerror/main_test.go @@ -0,0 +1,32 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package colexecerror_test + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/security/securityassets" + "github.com/cockroachdb/cockroach/pkg/security/securitytest" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util/randutil" +) + +//go:generate ../../util/leaktest/add-leaktest.sh *_test.go + +func TestMain(m *testing.M) { + securityassets.SetLoader(securitytest.EmbeddedAssets) + randutil.SeedForTests() + serverutils.InitTestServerFactory(server.TestServerFactory) + + os.Exit(m.Run()) +} diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 34ad29e50e96..512ecff38fec 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -26,6 +26,7 @@ import ( "math/bits" "net" "regexp/syntax" + "strconv" "strings" "time" "unicode" @@ -5688,6 +5689,50 @@ SELECT Info: "This function is used only by CockroachDB's developers for testing purposes.", Volatility: volatility.Volatile, }, + tree.Overload{ + Types: tree.ParamTypes{{Name: "msg", Typ: types.String}, {Name: "mode", Typ: types.String}}, + ReturnType: tree.FixedReturnType(types.Int), + Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + // The user must have REPAIRCLUSTER to use this builtin. + if err := evalCtx.SessionAccessor.CheckPrivilege( + ctx, syntheticprivilege.GlobalPrivilegeObject, privilege.REPAIRCLUSTER, + ); err != nil { + return nil, err + } + + s, ok := tree.AsDString(args[0]) + if !ok { + return nil, errors.Newf("expected string value, got %T", args[0]) + } + msg := string(s) + mode, ok := tree.AsDString(args[1]) + if !ok { + return nil, errors.Newf("expected string value, got %T", args[1]) + } + switch string(mode) { + case "internalAssertion": + err := errors.AssertionFailedf("%s", msg) + // Panic instead of returning the error. The vectorized panic-catcher + // will catch the panic and convert it into an internal error. + colexecerror.InternalError(err) + case "indexOutOfRange": + msg += string(msg[math.MaxInt]) + case "divideByZero": + var foo []int + msg += strconv.Itoa(len(msg) / len(foo)) + case "contextCanceled": + panic(context.Canceled) + default: + return nil, errors.Newf( + "expected mode to be one of: internalAssertion, indexOutOfRange, divideByZero, contextCanceled", + ) + } + // This code is unreachable. + panic(msg) + }, + Info: "This function is used only by CockroachDB's developers for testing purposes.", + Volatility: volatility.Volatile, + }, ), "crdb_internal.force_log_fatal": makeBuiltin( diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index 88eaee930f88..1a0af5d0481e 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2573,6 +2573,7 @@ var builtinOidsArray = []string{ 2605: `merge_aggregated_stmt_metadata(arg1: jsonb) -> jsonb`, 2606: `crdb_internal.protect_mvcc_history(timestamp: decimal, expiration_window: interval, description: string) -> int`, 2607: `crdb_internal.extend_mvcc_history_protection(job_id: int) -> void`, + 2608: `crdb_internal.force_panic(msg: string, mode: string) -> int`, } var builtinOidsBySignature map[string]oid.Oid diff --git a/pkg/testutils/lint/lint_test.go b/pkg/testutils/lint/lint_test.go index f333640480a9..30a967c4351d 100644 --- a/pkg/testutils/lint/lint_test.go +++ b/pkg/testutils/lint/lint_test.go @@ -2062,7 +2062,7 @@ func TestLint(t *testing.T) { // engine, don't forget to "register" the newly added package in // sql/colexecerror/error.go file. "sql/col*", - ":!sql/colexecerror/error.go", + ":!sql/colexecerror/error*.go", // This exception is because execgen itself uses panics during code // generation - not at execution time. The (glob,exclude) directive // (see git help gitglossary) makes * behave like a normal, single dir