Skip to content

Commit dfbf379

Browse files
authored
Yolo26 post-process on the parser side (#278)
* (N, 300, 6) layout for depthai-nodes * Parser-side top-k and confidence thresholding instead of tools-side * Support for instance segmentation * Added OBB support * Skip num_keypoints_check for yolo26 since it's handled separately * Remove OBB support; move decoding into utils to make it cleaner * Remove validation, transpose, shape errors * pre-commit * Try to infer everything from the NN archive (seg / pose / detection mode) and a single yolo26 subtype * Merge decoding logic into one for yolo26 * parse_kpts is the same for previous YOLO versions and 26 * Fixed a typo in my comment + output shapes for masks and pose
1 parent 89818d7 commit dfbf379

File tree

2 files changed

+296
-63
lines changed

2 files changed

+296
-63
lines changed

depthai_nodes/node/parsers/utils/yolo.py

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class YOLOSubtype(str, Enum):
2727
V8 = "yolov8"
2828
V9 = "yolov9"
2929
V10 = "yolov10"
30+
V26 = "yolo26"
3031
P = "yolo-p"
3132
GOLD = "yolo-gold"
3233
DEFAULT = ""
@@ -308,7 +309,7 @@ def parse_yolo_output(
308309

309310
def parse_kpts(
310311
kpts: np.ndarray, n_keypoints: int, img_shape: Tuple[int, int]
311-
) -> List[Tuple[int, int, float]]:
312+
) -> List[Tuple[float, float, float]]:
312313
"""Parse keypoints.
313314
314315
@param kpts: Result keypoints.
@@ -318,7 +319,7 @@ def parse_kpts(
318319
@param img_shape: Image shape of the model input in (height, width) format.
319320
@type img_shape: Tuple[int, int]
320321
@return: Parsed keypoints.
321-
@rtype: List[Tuple[int, int, float]]
322+
@rtype: List[Tuple[float, float, float]]
322323
"""
323324
h, w = img_shape
324325
kps = []
@@ -330,6 +331,100 @@ def parse_kpts(
330331
return kps
331332

332333

334+
def _apply_conf_and_topk(
    boxes: np.ndarray,
    scores: np.ndarray,
    conf_threshold: float,
    max_det: int,
    auxiliary: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """Apply confidence threshold and top-k filtering.

    For every row the best class score and its class index are taken. Rows whose best
    score is below C{conf_threshold} are dropped, and at most C{max_det} of the
    remaining rows are returned, ordered by descending score.

    @param boxes: Bounding boxes array (A, 4).
    @type boxes: np.ndarray
    @param scores: Class scores array (A, nc).
    @type scores: np.ndarray
    @param conf_threshold: Confidence threshold. A row is kept when its best class
        score is greater than or equal to this value.
    @type conf_threshold: float
    @param max_det: Maximum number of detections. Zero or a negative value yields an
        empty result.
    @type max_det: int
    @param auxiliary: Generic parameter for task-specific data (mask coefficients for
        segmentation and keypoints for pose) to be filtered according to the
        detections.
    @type auxiliary: Optional[np.ndarray]
    @return: Tuple of (float32 results array (K, 6) laid out as box, score, class id;
        filtered auxiliary rows in the dtype of C{auxiliary}, or None when no
        auxiliary was given).
    @rtype: Tuple[np.ndarray, Optional[np.ndarray]]
    """
    best_scores = scores.max(axis=-1)
    # Integer row indices of the candidates that pass the confidence threshold.
    candidates = np.flatnonzero(best_scores >= conf_threshold)

    # A negative budget would become a negative ``kth``/slice bound further down and
    # silently return the wrong rows (or raise), so clamp it to "no detections".
    limit = max(int(max_det), 0)

    if candidates.size == 0 or limit == 0:
        empty_aux = None
        if auxiliary is not None:
            # Keep the trailing dimensions and dtype of the input so the empty
            # result is interchangeable with the non-empty one.
            empty_aux = np.zeros((0,) + auxiliary.shape[1:], dtype=auxiliary.dtype)
        return np.zeros((0, 6), dtype=np.float32), empty_aux

    candidate_scores = best_scores[candidates]
    if candidate_scores.shape[0] > limit:
        # Partial selection of the `limit` best rows, then sort only those.
        top = np.argpartition(-candidate_scores, limit - 1)[:limit]
        top = top[np.argsort(-candidate_scores[top])]
    else:
        top = np.argsort(-candidate_scores)

    # Compose both selections so every array is gathered exactly once.
    selected = candidates[top]

    cls_ids = scores[selected].argmax(axis=-1).astype(np.float32)
    results = np.concatenate(
        [boxes[selected], best_scores[selected][:, None], cls_ids[:, None]], axis=1
    ).astype(np.float32)
    aux_kept = auxiliary[selected] if auxiliary is not None else None

    return results, aux_kept
391+
392+
393+
def decode_yolo26(
    raw: np.ndarray,
    conf_threshold: float,
    max_det: int,
    extra_raw: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """Decode YOLO26 output for detection, segmentation, or pose.

    The YOLO26 end2end head already emits decoded boxes (xyxy in pixels), so the only
    work left is confidence thresholding and top-k selection. Only the first batch
    element of C{raw} (and of C{extra_raw}, when given) is processed. An auxiliary
    tensor (mask coefficients or keypoints) is filtered with the same rows as the
    detections.

    @param raw: Raw detection tensor (N, A, 4+nc).
    @type raw: np.ndarray
    @param conf_threshold: Confidence threshold.
    @type conf_threshold: float
    @param max_det: Maximum number of detections.
    @type max_det: int
    @param extra_raw: Optional auxiliary tensor (N, A, M) such as mask coefficients or
        keypoints. When provided the kept rows are returned as the second element.
    @type extra_raw: Optional[np.ndarray]
    @return: Tuple of (detection results (K, 6), kept auxiliary data (K, M) or None).
    @rtype: Tuple[np.ndarray, Optional[np.ndarray]]
    """
    first_image = raw[0]  # (A, 4+nc)
    first_extra = None if extra_raw is None else extra_raw[0]

    # The first four columns are the box, the remaining ones the per-class scores.
    detections, kept_rows = _apply_conf_and_topk(
        first_image[:, :4],
        first_image[:, 4:],
        conf_threshold,
        max_det,
        first_extra,
    )
    return detections, kept_rows
426+
427+
333428
def decode_yolo_output(
334429
yolo_outputs: List[np.ndarray],
335430
strides: List[int],

0 commit comments

Comments
 (0)