cilium · ti-mo · Mar 13, 2025 · Feb 19, 2025 · Feb 19, 2025 · Feb 19, 2025
@@ -56,6 +56,12 @@ const minVerifierLogSize = 64 * 1024
 // future, but avoid the unnecessary EINVAL for now.
 const maxVerifierLogSize = math.MaxUint32 >> 2
 
+// maxVerifierAttempts is the maximum number of times the verifier will retry
+// loading a program with a growing log buffer before giving up. Since we double
+// the log size on every attempt, this is the absolute maximum number of
+// attempts before the buffer reaches [maxVerifierLogSize].
+const maxVerifierAttempts = 30
+
 // ProgramOptions control loading a program into the kernel.
 type ProgramOptions struct {
 	// Bitmap controlling the detail emitted by the kernel's eBPF verifier log.
@@ -418,59 +424,44 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er
 		return nil, fmt.Errorf("log level: %w", internal.ErrNotSupportedOnOS)
 	}
 
-	// The caller requested a specific verifier log level. Set up the log buffer
-	// so that there is a chance of loading the program in a single shot.
-	logSize := internal.Between(opts.LogSizeStart, minVerifierLogSize, maxVerifierLogSize)
 	var logBuf []byte
-	if !opts.LogDisabled && opts.LogLevel != 0 {
-		logBuf = make([]byte, logSize)
-		attr.LogLevel = opts.LogLevel
-		attr.LogSize = uint32(len(logBuf))
-		attr.LogBuf = sys.SlicePointer(logBuf)
-	}
-
-	for {
-		var fd *sys.FD
+	var fd *sys.FD
+	if opts.LogDisabled {
+		// Loading with logging disabled should never retry.
 		fd, err = sys.ProgLoad(attr)
 		if err == nil {
-			return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil
+			return &Program{"", fd, spec.Name, "", spec.Type}, nil
 		}
-
-		if opts.LogDisabled {
-			break
-		}
-
-		if attr.LogTrueSize != 0 && attr.LogSize >= attr.LogTrueSize {
-			// The log buffer already has the correct size.
-			break
+	} else {
+		// Only specify log size if log level is also specified. Setting size
+		// without level results in EINVAL. Level will be bumped to LogLevelBranch
+		// if the first load fails.
+		if opts.LogLevel != 0 {
+			attr.LogLevel = opts.LogLevel
+			attr.LogSize = internal.Between(opts.LogSizeStart, minVerifierLogSize, maxVerifierLogSize)
 		}
 
-		if attr.LogSize != 0 && !errors.Is(err, unix.ENOSPC) {
-			// Logging is enabled and the error is not ENOSPC, so we can infer
-			// that the log buffer is large enough.
-			break
-		}
+		attempts := 1
+		for {
+			if attr.LogLevel != 0 {
+				logBuf = make([]byte, attr.LogSize)
+				attr.LogBuf = sys.SlicePointer(logBuf)
+			}
 
-		if attr.LogLevel == 0 {
-			// Logging is not enabled but loading the program failed. Enable
-			// basic logging.
-			attr.LogLevel = LogLevelBranch
-		}
+			fd, err = sys.ProgLoad(attr)
+			if err == nil {
+				return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil
+			}
 
-		// Make an educated guess how large the buffer should be by multiplying.
-		// Ensure the size doesn't overflow.
-		const factor = 2
-		logSize = internal.Between(logSize, minVerifierLogSize, maxVerifierLogSize/factor)
-		logSize *= factor
+			if !retryLogAttrs(attr, opts.LogSizeStart, err) {
+				break
+			}
 
-		if attr.LogTrueSize != 0 {
-			// The kernel has given us a hint how large the log buffer has to be.
-			logSize = attr.LogTrueSize
+			if attempts >= maxVerifierAttempts {
+				return nil, fmt.Errorf("load program: %w (bug: hit %d verifier attempts)", err, maxVerifierAttempts)
+			}
+			attempts++
 		}
-
-		logBuf = make([]byte, logSize)
-		attr.LogSize = logSize
-		attr.LogBuf = sys.SlicePointer(logBuf)
 	}
 
 	end := bytes.IndexByte(logBuf, 0)
@@ -520,6 +511,50 @@ func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er
 	return nil, internal.ErrorWithLog("load program", err, logBuf)
 }
 
+func retryLogAttrs(attr *sys.ProgLoadAttr, startSize uint32, err error) bool {
+	if attr.LogSize == maxVerifierLogSize {
+		// Maximum buffer size reached, don't grow or retry.
+		return false
+	}
+
+	// ENOSPC means the log was enabled on the previous iteration, so we only
+	// need to grow the buffer.
+	if errors.Is(err, unix.ENOSPC) {
+		if attr.LogTrueSize != 0 {
+			// Kernel supports LogTrueSize and previous iteration undershot the buffer
+			// size. Try again with the given true size.
+			attr.LogSize = attr.LogTrueSize
+			return true
+		}
+
+		// Ensure the size doesn't overflow.
+		const factor = 2
+		if attr.LogSize >= maxVerifierLogSize/factor {
+			attr.LogSize = maxVerifierLogSize
+			return true
+		}
+
+		// Make an educated guess how large the buffer should be by multiplying. Due
+		// to int division, this rounds down odd sizes.
+		attr.LogSize = internal.Between(attr.LogSize, minVerifierLogSize, maxVerifierLogSize/factor)
+		attr.LogSize *= factor
+
+		return true
+	}
+
+	if attr.LogLevel == 0 {
+		// Loading the program failed, it wasn't a buffer-related error, and the log
+		// was disabled the previous iteration. Enable basic logging and retry.
+		attr.LogLevel = LogLevelBranch
+		attr.LogSize = internal.Between(startSize, minVerifierLogSize, maxVerifierLogSize)
+		return true
+	}
+
+	// Loading the program failed for a reason other than buffer size and the log
+	// was already enabled the previous iteration. Don't retry.
+	return false
+}
+
 // NewProgramFromFD creates a program from a raw fd.
 //
 // You should not use fd after calling this function.

@@ -372,6 +372,95 @@ func TestProgramVerifierLog(t *testing.T) {
 	qt.Assert(t, qt.Not(qt.HasLen(prog.VerifierLog, 0)))
 }
 
+func TestProgramVerifierLogRetry(t *testing.T) {
+	someError := errors.New("not a buffer error")
+
+	t.Run("retry with oversized buffer, no log_true_size", func(t *testing.T) {
+		// First load failure, without logging enabled. Retry with logging enabled.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, someError)))
+		qt.Assert(t, qt.Equals(attr.LogLevel, LogLevelBranch))
+		qt.Assert(t, qt.Equals(attr.LogSize, minVerifierLogSize))
+
+		// Second failure with logging enabled. No buffer error, don't retry.
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, someError)))
+		qt.Assert(t, qt.Equals(attr.LogLevel, LogLevelBranch))
+		qt.Assert(t, qt.Equals(attr.LogSize, minVerifierLogSize))
+	})
+
+	t.Run("retry with oversized buffer, with log_true_size", func(t *testing.T) {
+		// First load failure, without logging enabled. Retry with larger buffer.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, someError)))
+
+		// Buffer was sufficiently large and log_true_size was set. Don't retry and
+		// don't modify LogSize to LogTrueSize.
+		attr.LogTrueSize = 123
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, someError)))
+		qt.Assert(t, qt.Equals(attr.LogSize, minVerifierLogSize))
+	})
+
+	t.Run("retry with undersized buffer, no log_true_size", func(t *testing.T) {
+		// First load failure, without logging enabled. Retry with larger buffer.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, someError)))
+
+		// Second failure, this time the kernel signals an undersized buffer. Retry
+		// with double the size.
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, unix.ENOSPC)))
+		qt.Assert(t, qt.Equals(attr.LogSize, minVerifierLogSize*2))
+	})
+
+	t.Run("retry with undersized buffer, with log_true_size", func(t *testing.T) {
+		// First load failure, without logging enabled. Retry with larger buffer.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, someError)))
+
+		// Second failure, the kernel signals undersized buffer and also sets
+		// log_true_size. Retry with the exact size required.
+		attr.LogTrueSize = 123
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, unix.ENOSPC)))
+		qt.Assert(t, qt.Equals(attr.LogSize, 123))
+	})
+
+	t.Run("grow to maximum buffer size", func(t *testing.T) {
+		// Previous loads pushed the log size to (or above) half of the maximum,
+		// which would make it overflow on the next retry. Make sure the log size
+		// actually hits the maximum so we can bail out.
+		attr := &sys.ProgLoadAttr{LogLevel: LogLevelBranch, LogSize: maxVerifierLogSize / 2}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, 0, unix.ENOSPC)))
+		qt.Assert(t, qt.Equals(attr.LogSize, maxVerifierLogSize))
+
+		// Don't retry if the buffer is already at the maximum size, no matter
+		// the return code.
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, someError)))
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, unix.ENOSPC)))
+	})
+
+	t.Run("start at maximum buffer size", func(t *testing.T) {
+		// The user requested a log buffer exceeding the maximum size, but no log
+		// level. Retry with the maximum size and default log level.
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		qt.Assert(t, qt.IsTrue(retryLogAttrs(attr, math.MaxUint32, unix.EINVAL)))
+		qt.Assert(t, qt.Equals(attr.LogLevel, LogLevelBranch))
+		qt.Assert(t, qt.Equals(attr.LogSize, maxVerifierLogSize))
+
+		// Log still doesn't fit maximum-size buffer. Don't retry.
+		qt.Assert(t, qt.IsFalse(retryLogAttrs(attr, 0, unix.ENOSPC)))
+	})
+
+	t.Run("ensure growth terminates within max attempts", func(t *testing.T) {
+		attr := &sys.ProgLoadAttr{LogLevel: 0, LogSize: 0}
+		var terminated bool
+		for i := 1; i <= maxVerifierAttempts; i++ {
+			if !retryLogAttrs(attr, 0, syscall.ENOSPC) {
+				terminated = true
+			}
+		}
+		qt.Assert(t, qt.IsTrue(terminated))
+	})
+}
+
 func TestProgramWithUnsatisfiedMap(t *testing.T) {
 	coll, err := LoadCollectionSpec("testdata/loader-el.elf")
 	if err != nil {