Skip to content

Commit d4327d1

Browse files
authored
Merge pull request #399 from OdedViner/skip_subvol_get_info_err
csi: skip not-ready cephfs subvolumes in ls and summarize errors
2 parents 37fe7de + 6883e5c commit d4327d1

1 file changed

Lines changed: 71 additions & 5 deletions

File tree

pkg/filesystem/subvolume.go

Lines changed: 71 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"encoding/json"
2222
"errors"
2323
"fmt"
24+
osexec "os/exec"
2425
"strconv"
2526
"strings"
2627
"syscall"
@@ -250,6 +251,9 @@ func listCephFSSubvolumes(ctx context.Context, clientsets *k8sutil.Clientsets, o
250251
}
251252
fmt.Println("Filesystem Subvolume SubvolumeGroup State")
252253

254+
// collect subvolumes that are not ready (EAGAIN) to show a summary at the end
255+
var notReadyErrors []string
256+
253257
// this iterates over the filesystems and subvolumegroup to get the list of subvolumes that exist
254258
for _, fs := range fsstruct {
255259
// gets the subvolumegroup in the filesystem
@@ -276,10 +280,9 @@ func listCephFSSubvolumes(ctx context.Context, clientsets *k8sutil.Clientsets, o
276280
// subvolume info returns error in case of pending clone or if it is not ready
277281
// it is suggested to delete the pvc before deleting the subvolume.
278282
if err != nil {
279-
if errors.Is(err, syscall.EAGAIN) {
280-
logging.Warning("Found pending clone: %q", sv.Name)
281-
logging.Warning("Please delete the pending pv if any before deleting the subvolume %s", sv.Name)
282-
logging.Warning("To avoid stale resources, please scale down the cephfs deployment before deleting the subvolume.")
283+
if isSubvolumeNotReady(err) {
284+
// Skip listing this subvolume but remember to report it later
285+
notReadyErrors = append(notReadyErrors, fmt.Sprintf("%s/%s/%s: %v", fs.Name, svg.Name, sv.Name, err))
283286
continue
284287
}
285288
logging.Fatal(fmt.Errorf("failed to get subvolume state: %q %q", sv.Name, err))
@@ -317,6 +320,15 @@ func listCephFSSubvolumes(ctx context.Context, clientsets *k8sutil.Clientsets, o
317320
}
318321
}
319322
}
323+
324+
// After listing, show a concise summary of skipped subvolumes due to not-ready state
325+
if len(notReadyErrors) > 0 {
326+
logging.Warning("%d subvolumes were skipped because they are not ready (pending clone or in progress):", len(notReadyErrors))
327+
for _, errMsg := range notReadyErrors {
328+
logging.Warning(" %s", errMsg)
329+
}
330+
logging.Warning("To avoid stale resources, you may scale down the cephfs deployment before deleting such subvolumes.")
331+
}
320332
}
321333

322334
// getSubvolumeState returns the state of the subvolume
@@ -326,7 +338,8 @@ func getSubvolumeState(ctx context.Context, clientsets *k8sutil.Clientsets, oper
326338

327339
subVolumeInfo, errvol := runCommand(ctx, clientsets, operatorNamespace, clusterNamespace, cmd, args)
328340
if errvol != nil {
329-
logging.Error(errvol, "failed to get subvolume info")
341+
// Avoid fmt EXTRA artifacts by formatting the error ourselves
342+
logging.Error(fmt.Errorf("failed to get subvolume info for %s/%s/%s: %v", fsName, SubvolumeGroup, SubVol, errvol))
330343
return "", errvol
331344
}
332345
var info map[string]interface{}
@@ -341,6 +354,59 @@ func getSubvolumeState(ctx context.Context, clientsets *k8sutil.Clientsets, oper
341354
return state, nil
342355
}
343356

357+
// exitCodeFromError walks the error chain of err looking for a numeric
// exit/errno code. It returns the code and true on the first match, or
// (0, false) when nothing in the chain carries one (including err == nil).
//
// Lookup order:
//  1. a syscall.Errno, returned as its integer value;
//  2. any error exposing an ExitStatus() int method;
//  3. an *os/exec.ExitError whose Sys() value is a syscall.WaitStatus.
func exitCodeFromError(err error) (int, bool) {
	// 1. Raw errno somewhere in the chain.
	var sysErr syscall.Errno
	if errors.As(err, &sysErr) {
		return int(sysErr), true
	}

	// 2. Any error type that reports its own exit status.
	var coder interface{ ExitStatus() int }
	if errors.As(err, &coder) {
		return coder.ExitStatus(), true
	}

	// 3. Locally executed process that terminated with a wait status.
	var procErr *osexec.ExitError
	if !errors.As(err, &procErr) {
		return 0, false
	}
	waitStatus, isWaitStatus := procErr.Sys().(syscall.WaitStatus)
	if !isWaitStatus {
		return 0, false
	}
	return waitStatus.ExitStatus(), true
}
379+
380+
// isSubvolumeNotReady detects Ceph EAGAIN "not ready" conditions
381+
func isSubvolumeNotReady(err error) bool {
382+
if err == nil {
383+
return false
384+
}
385+
386+
// Best check: proper error type match
387+
if errors.Is(err, syscall.EAGAIN) {
388+
return true
389+
}
390+
391+
// Next best: exit codes 11 or -11 (common Ceph EAGAIN)
392+
if code, ok := exitCodeFromError(err); ok {
393+
if code == 11 || code == -11 {
394+
return true
395+
}
396+
}
397+
398+
// Fallback: minimal string detection
399+
msg := err.Error()
400+
if strings.Contains(msg, "EAGAIN") {
401+
return true
402+
}
403+
if strings.Contains(msg, "exit code 11") || strings.Contains(msg, "exit status 11") || strings.Contains(msg, "exit status -11") {
404+
return true
405+
}
406+
407+
return false
408+
}
409+
344410
// gets list of filesystem
345411
func getFileSystem(ctx context.Context, clientsets *k8sutil.Clientsets, operatorNamespace, clusterNamespace string) ([]fsStruct, error) {
346412

0 commit comments

Comments
 (0)