DeepStream 使用 YOLO-Seg 时遇到 Myelin 报错：Division by 0 detected in the shape graph. Tensor (Divisor) "sp__mye3" is equal to 0.

本文链接：https://blog.csdn.net/qq_45667330/article/details/147483296

之前在deepstream6.3/7.0里使用yolo-segv8/v5模型的时候都遇到了上述的错误：问题发生在帧中未检测到任何物体时，导致 TensorRT 形状图中出现除以零的结果，TensorRT会报错：

ERROR: [TRT]: 1: [runner.cpp::shapeChangeHelper::621] Error Code 1: Myelin (Division by 0 detected in the shape graph. Tensor (Divisor) "sp__mye3" is equal to 0.; )
ERROR: Failed to enqueue trt inference batch
ERROR: Infer context enqueue buffer failed, nvinfer error:NVDSINFER_TENSORRT_ERROR
0:07:59.483589952 19370 0xaaab062aa5e0 WARN nvinfer gstnvinfer.cpp:1418:gst_nvinfer_input_queue_loop:<primary_gie> error: Failed to queue input batch for inferencing

浏览了Nvidia相关开发网站后，发现工程师回复说这是TensorRT当前版本里的bug，后续版本会更正，可两年了这个bug依旧未被消除。

因此我在由 pt 文件生成 onnx 之前，修改了 ultralytics 里的 export_yoloV8_seg.py：在代码中把未检测到物体时的检测数目从 0 变成 1，并在被识别图像的左上角添加一个类别索引超出范围的伪目标。

以确保：

- NMS 函数始终返回至少一个检测；
- 如果没有发现真正的检测，则添加具有特定特征的虚拟检测；
- 虚拟检测在处理过程中会被过滤掉，因此不会影响实际结果；
- 代码始终保持至少一次检测，以避免除以零的错误。

这样跑起来正常不会出现Error，而且由于labels.txt里不包含这个类别也不会被绘制到图像上。问题得以解决。

1.修改NMS.forward方法本身，需要确保它始终返回至少一个检测结果。代码修改方法如下：

class NMS(torch.autograd.Function):
    """Placeholder for the ONNX ``NonMaxSuppression`` operator.

    ``forward`` does not perform any suppression: it fabricates a random but
    well-formed index tensor so that the code consuming the NMS result can be
    executed while the model is being exported.
    """

    @staticmethod
    def forward(self, boxes, scores, max_output_boxes_per_class=100, iou_threshold=0.45, score_threshold=0.25):
        """Return fake ``selected_indices`` of shape ``(num_det, 3)`` and dtype int64.

        The first parameter is named ``self`` for compatibility with the
        original signature; ``torch.autograd.Function.apply`` passes the
        autograd context in that position.

        Each row is ``[batch_index, 0, box_index]``. ``boxes``,
        ``iou_threshold`` and ``score_threshold`` only mirror the operator
        signature; apart from ``boxes.device`` they are not used here.
        """
        device = boxes.device
        batch = scores.shape[0]

        # Always return at least one row (an empty result is what triggers the
        # TensorRT division-by-zero) and never more rows than the caller's
        # cap, otherwise the consumer's fixed-size output buffer overflows
        # when the cap is smaller than 100.
        upper = max(1, min(100, int(max_output_boxes_per_class)))
        num_det = random.randint(1, upper)

        batches = torch.randint(0, batch, (num_det,)).sort()[0].to(device)
        idxs = torch.arange(100, 100 + num_det).to(device)
        zeros = torch.zeros((num_det,), dtype=torch.int64).to(device)

        # Column layout: batch index, class index (always 0), box index.
        selected_indices = torch.stack([batches, zeros, idxs], dim=1)
        return selected_indices.to(torch.int64).contiguous()

2.修改DeepStreamOutput.forward方法以处理添加伪目标检测的情况。

class DeepStreamOutput(nn.Module):
    """Post-processing stage chained after the segmentation model for export.

    Runs NMS on the raw predictions, pools the mask prototypes for every
    selected box and packs the result into four tensors padded to ``max_det``
    rows per image: boxes ``(b, max_det, 4)``, scores ``(b, max_det, 1)``,
    classes ``(b, max_det, 1)`` and masks ``(b, max_det, mh, mw)``.

    When no score of an image exceeds ``conf_thres`` a dummy detection with
    the out-of-range class index ``nc`` is written into box slot 0, so that
    NMS never returns an empty selection.

    Args:
        nc: number of classes.
        conf_thres: score threshold handed to NMS.
        iou_thres: IoU threshold handed to NMS.
        max_det: number of detection rows in the padded outputs.
    """

    def __init__(self, nc, conf_thres=0.25, iou_thres=0.45, max_det=100):
        # Initialise nn.Module before assigning any attribute.
        super().__init__()
        self.nc = nc
        self.conf_thres = conf_thres
        self.iou_thres = iou_thres
        self.max_det = max_det

    def forward(self, x):
        """Convert ``x = (predictions, prototypes)`` into padded detections.

        ``x[0]`` is presumably laid out as (batch, 4 + nc + mask_coeffs, boxes)
        with boxes as (cx, cy, w, h) -- TODO confirm against the model head.
        ``x[1]`` holds the mask prototypes with shape (b, c, mh, mw).
        """
        preds = x[0].transpose(1, 2)
        # Clone because the dummy injection below writes in place and the
        # slice would otherwise alias the caller's tensor x[0].
        boxes = preds[:, :, :4].clone()
        scores, classes = torch.max(preds[:, :, 4:self.nc+4], 2, keepdim=True)
        classes = classes.float()
        masks = preds[:, :, self.nc+4:]
        protos = x[1]

        # Add a dummy detection if no score of an image exceeds the threshold.
        # NOTE(review): this is data-dependent Python control flow; when the
        # module is traced by torch.onnx.export the branch is presumably
        # decided once on the example input -- confirm on the exported graph.
        dummy_box = torch.tensor([1.0, 1.0, 2.0, 2.0], dtype=boxes.dtype, device=boxes.device)
        for img in range(boxes.shape[0]):
            if not torch.any(scores[img] > self.conf_thres):
                # Slot 0 becomes the dummy: tiny box in the top-left corner,
                # maximal score, class index outside the normal range.
                boxes[img, 0, :] = dummy_box
                scores[img, 0, 0] = 1.0
                classes[img, 0, 0] = float(self.nc)

        # (cx, cy, w, h) -> (x1, y1, x2, y2); the dummy box becomes (0, 0, 2, 2).
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype,
                                      device=boxes.device)

        boxes = boxes @ convert_matrix

        # Argument order must match NMS.forward/symbolic:
        # (boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold).
        selected_indices = NMS.apply(boxes, scores.transpose(1, 2).contiguous(), self.max_det, self.iou_thres,
                                     self.conf_thres)

        b, c, mh, mw = protos.shape
        n = selected_indices.shape[0]

        batch_index = selected_indices[:, 0]
        box_index = selected_indices[:, 2]

        selected_boxes = boxes[batch_index, box_index, :]
        selected_scores = scores[batch_index, box_index, :]
        selected_classes = classes[batch_index, box_index, :]
        selected_masks = masks[batch_index, box_index, :]

        # Filter out any dummy detections we added. They are recognised by the
        # out-of-range class index: real classes come from an argmax over nc
        # entries and are therefore always smaller than nc. (Comparing box
        # coordinates is unreliable because the boxes were converted above.)
        valid_mask = selected_classes[:, 0] != float(self.nc)

        # If all detections were filtered (meaning we only had dummies),
        # keep one dummy to avoid division by zero errors.
        if not torch.any(valid_mask) and n > 0:
            valid_mask[0] = True

        # Apply the filtering.
        if torch.any(~valid_mask):
            selected_boxes = selected_boxes[valid_mask]
            selected_scores = selected_scores[valid_mask]
            selected_classes = selected_classes[valid_mask]
            selected_masks = selected_masks[valid_mask]
            batch_index = batch_index[valid_mask]
            n = selected_boxes.shape[0]

        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 'half_pixel', 'avg', int(mh), int(mw), 0, 0.25)

        # Combine each detection's mask coefficients with its pooled prototypes.
        masks_protos = selected_masks.unsqueeze(dim=1) @ pooled_proto.float().view(n, c, mh * mw)
        masks_protos = masks_protos.sigmoid().view(-1, mh * mw)

        dets = torch.cat([selected_boxes, selected_scores, selected_classes, masks_protos], dim=1)

        # Replicate the flat detection list per image and zero the rows that
        # belong to a different image.
        batched_dets = dets.unsqueeze(0).repeat(b, 1, 1)
        batch_template = torch.arange(0, b, dtype=batch_index.dtype, device=batch_index.device).unsqueeze(1)
        batched_dets = batched_dets.where((batch_index == batch_template).unsqueeze(-1), batched_dets.new_zeros(1))

        y, i = batched_dets.shape[1:]

        # Pad to a fixed number of rows per image.
        final_dets = batched_dets.new_zeros((b, self.max_det, i))
        final_dets[:, :y, :] = batched_dets

        final_boxes = final_dets[:, :, :4]
        final_scores = final_dets[:, :, 4:5]
        final_classes = final_dets[:, :, 5:6]
        final_masks = final_dets[:, :, 6:]

        final_masks = final_masks.view(b, -1, mh, mw)

        return final_boxes, final_scores, final_classes, final_masks

export_yoloV8_seg.py全部代码如下：

import os
import sys
import random
import argparse
import warnings
import onnx
import torch
import torch.nn as nn
from copy import deepcopy
from ultralytics import YOLO
from ultralytics.utils.torch_utils import select_device
from ultralytics.nn.modules import C2f, Detect, RTDETRDecoder


class RoiAlign(torch.autograd.Function):
    """Placeholder that is exported as the ONNX ``RoiAlign`` operator.

    ``forward`` only produces a random tensor with the output shape of the
    operator; ``symbolic`` emits the actual ``RoiAlign`` node.
    """

    @staticmethod
    def forward(self, X, rois, batch_indices, coordinate_transformation_mode='half_pixel', mode='avg', output_height=160,
                output_width=160, sampling_ratio=0, spatial_scale=0.25):
        """Return random values shaped ``(num_rois, C, output_height, output_width)``.

        The result takes its device and dtype from ``rois``. ``X`` must be
        4-dimensional; only its channel count is used.
        """
        _, channels, _, _ = X.shape
        out_shape = (rois.shape[0], channels, output_height, output_width)
        return torch.randn(out_shape, device=rois.device, dtype=rois.dtype)

    @staticmethod
    def symbolic(g, X, rois, batch_indices, coordinate_transformation_mode='half_pixel', mode='avg', output_height=160,
                 output_width=160, sampling_ratio=0, spatial_scale=0.25):
        """Emit a ``RoiAlign`` node carrying the given settings as attributes."""
        attributes = {
            'coordinate_transformation_mode_s': coordinate_transformation_mode,
            'mode_s': mode,
            'output_height_i': output_height,
            'output_width_i': output_width,
            'sampling_ratio_i': sampling_ratio,
            'spatial_scale_f': spatial_scale,
        }
        return g.op("RoiAlign", X, rois, batch_indices, **attributes)


class NMS(torch.autograd.Function):
    """Placeholder that is exported as the ONNX ``NonMaxSuppression`` operator.

    ``forward`` does not perform any suppression: it fabricates a random but
    well-formed index tensor so that the code consuming the NMS result can be
    executed during export. ``symbolic`` emits the actual operator node.
    """

    @staticmethod
    def forward(self, boxes, scores, max_output_boxes_per_class=100, iou_threshold=0.45, score_threshold=0.25):
        """Return fake ``selected_indices`` of shape ``(num_det, 3)`` and dtype int64.

        The first parameter is named ``self`` for compatibility with the
        original signature; ``torch.autograd.Function.apply`` passes the
        autograd context in that position.

        Each row is ``[batch_index, 0, box_index]``. ``boxes``,
        ``iou_threshold`` and ``score_threshold`` only mirror the operator
        signature; apart from ``boxes.device`` they are not used here.
        """
        device = boxes.device
        batch = scores.shape[0]

        # Always return at least one row (an empty result is what triggers the
        # TensorRT division-by-zero) and never more rows than the caller's
        # cap, otherwise the consumer's fixed-size output buffer overflows
        # when the cap is smaller than 100.
        upper = max(1, min(100, int(max_output_boxes_per_class)))
        num_det = random.randint(1, upper)

        batches = torch.randint(0, batch, (num_det,)).sort()[0].to(device)
        idxs = torch.arange(100, 100 + num_det).to(device)
        zeros = torch.zeros((num_det,), dtype=torch.int64).to(device)

        # Column layout: batch index, class index (always 0), box index.
        selected_indices = torch.stack([batches, zeros, idxs], dim=1)
        return selected_indices.to(torch.int64).contiguous()

    @staticmethod
    def symbolic(g, boxes, scores, max_output_boxes_per_class=100, iou_threshold=0.45, score_threshold=0.25):
        """Emit a ``NonMaxSuppression`` node.

        The three scalar settings are passed as one-element tensors, in the
        order max_output_boxes_per_class, iou_threshold, score_threshold.
        """
        return g.op("NonMaxSuppression", boxes, scores, torch.tensor([max_output_boxes_per_class]),
                    torch.tensor([iou_threshold]), torch.tensor([score_threshold]), center_point_box_i=0)


class DeepStreamOutput(nn.Module):
    """Post-processing stage chained after the segmentation model for export.

    Runs NMS on the raw predictions, pools the mask prototypes for every
    selected box and packs the result into four tensors padded to ``max_det``
    rows per image: boxes ``(b, max_det, 4)``, scores ``(b, max_det, 1)``,
    classes ``(b, max_det, 1)`` and masks ``(b, max_det, mh, mw)``.

    When no score of an image exceeds ``conf_thres`` a dummy detection with
    the out-of-range class index ``nc`` is written into box slot 0, so that
    NMS never returns an empty selection.

    Args:
        nc: number of classes.
        conf_thres: score threshold handed to NMS.
        iou_thres: IoU threshold handed to NMS.
        max_det: number of detection rows in the padded outputs.
    """

    def __init__(self, nc, conf_thres=0.25, iou_thres=0.45, max_det=100):
        # Initialise nn.Module before assigning any attribute.
        super().__init__()
        self.nc = nc
        self.conf_thres = conf_thres
        self.iou_thres = iou_thres
        self.max_det = max_det

    def forward(self, x):
        """Convert ``x = (predictions, prototypes)`` into padded detections.

        ``x[0]`` is presumably laid out as (batch, 4 + nc + mask_coeffs, boxes)
        with boxes as (cx, cy, w, h) -- TODO confirm against the model head.
        ``x[1]`` holds the mask prototypes with shape (b, c, mh, mw).
        """
        preds = x[0].transpose(1, 2)
        # Clone because the dummy injection below writes in place and the
        # slice would otherwise alias the caller's tensor x[0].
        boxes = preds[:, :, :4].clone()
        scores, classes = torch.max(preds[:, :, 4:self.nc+4], 2, keepdim=True)
        classes = classes.float()
        masks = preds[:, :, self.nc+4:]
        protos = x[1]

        # Add a dummy detection if no score of an image exceeds the threshold.
        # NOTE(review): this is data-dependent Python control flow; when the
        # module is traced by torch.onnx.export the branch is presumably
        # decided once on the example input -- confirm on the exported graph.
        dummy_box = torch.tensor([1.0, 1.0, 2.0, 2.0], dtype=boxes.dtype, device=boxes.device)
        for img in range(boxes.shape[0]):
            if not torch.any(scores[img] > self.conf_thres):
                # Slot 0 becomes the dummy: tiny box in the top-left corner,
                # maximal score, class index outside the normal range.
                boxes[img, 0, :] = dummy_box
                scores[img, 0, 0] = 1.0
                classes[img, 0, 0] = float(self.nc)

        # (cx, cy, w, h) -> (x1, y1, x2, y2); the dummy box becomes (0, 0, 2, 2).
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype,
                                      device=boxes.device)

        boxes = boxes @ convert_matrix

        # Argument order must match NMS.forward/symbolic:
        # (boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold).
        selected_indices = NMS.apply(boxes, scores.transpose(1, 2).contiguous(), self.max_det, self.iou_thres,
                                     self.conf_thres)

        b, c, mh, mw = protos.shape
        n = selected_indices.shape[0]

        batch_index = selected_indices[:, 0]
        box_index = selected_indices[:, 2]

        selected_boxes = boxes[batch_index, box_index, :]
        selected_scores = scores[batch_index, box_index, :]
        selected_classes = classes[batch_index, box_index, :]
        selected_masks = masks[batch_index, box_index, :]

        # Filter out any dummy detections we added. They are recognised by the
        # out-of-range class index: real classes come from an argmax over nc
        # entries and are therefore always smaller than nc. (Comparing box
        # coordinates is unreliable because the boxes were converted above.)
        valid_mask = selected_classes[:, 0] != float(self.nc)

        # If all detections were filtered (meaning we only had dummies),
        # keep one dummy to avoid division by zero errors.
        if not torch.any(valid_mask) and n > 0:
            valid_mask[0] = True

        # Apply the filtering.
        if torch.any(~valid_mask):
            selected_boxes = selected_boxes[valid_mask]
            selected_scores = selected_scores[valid_mask]
            selected_classes = selected_classes[valid_mask]
            selected_masks = selected_masks[valid_mask]
            batch_index = batch_index[valid_mask]
            n = selected_boxes.shape[0]

        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 'half_pixel', 'avg', int(mh), int(mw), 0, 0.25)

        # Combine each detection's mask coefficients with its pooled prototypes.
        masks_protos = selected_masks.unsqueeze(dim=1) @ pooled_proto.float().view(n, c, mh * mw)
        masks_protos = masks_protos.sigmoid().view(-1, mh * mw)

        dets = torch.cat([selected_boxes, selected_scores, selected_classes, masks_protos], dim=1)

        # Replicate the flat detection list per image and zero the rows that
        # belong to a different image.
        batched_dets = dets.unsqueeze(0).repeat(b, 1, 1)
        batch_template = torch.arange(0, b, dtype=batch_index.dtype, device=batch_index.device).unsqueeze(1)
        batched_dets = batched_dets.where((batch_index == batch_template).unsqueeze(-1), batched_dets.new_zeros(1))

        y, i = batched_dets.shape[1:]

        # Pad to a fixed number of rows per image.
        final_dets = batched_dets.new_zeros((b, self.max_det, i))
        final_dets[:, :y, :] = batched_dets

        final_boxes = final_dets[:, :, :4]
        final_scores = final_dets[:, :, 4:5]
        final_classes = final_dets[:, :, 5:6]
        final_masks = final_dets[:, :, 6:]

        final_masks = final_masks.view(b, -1, mh, mw)

        return final_boxes, final_scores, final_classes, final_masks


def suppress_warnings():
    """Install 'ignore' filters for TracerWarning, UserWarning and DeprecationWarning."""
    for category in (torch.jit.TracerWarning, UserWarning, DeprecationWarning):
        warnings.filterwarnings('ignore', category=category)


def yolov8_export(weights, device):
    """Load the weights and return a frozen, fused copy of the network for export.

    The network is deep-copied from the loaded ``YOLO`` object, moved to
    ``device``, switched to eval mode and float precision, and fused. Detection
    heads are flagged for ONNX export and every ``C2f`` module is switched to
    its ``forward_split`` implementation.
    """
    net = deepcopy(YOLO(weights).model).to(device)

    # No gradients are needed for export.
    for param in net.parameters():
        param.requires_grad = False

    net.eval()
    net.float()
    net = net.fuse()

    for module in net.modules():
        if isinstance(module, (Detect, RTDETRDecoder)):
            module.dynamic = False
            module.export = True
            module.format = 'onnx'
        elif isinstance(module, C2f):
            module.forward = module.forward_split

    return net


def main(args):
    """Export the weights in ``args.weights`` to an ONNX file in the working directory.

    Steps: load the model, write ``labels.txt`` with one class name per line
    (when the model has class names), append ``DeepStreamOutput`` to the model,
    export to ``<weights basename>.onnx`` and optionally simplify the result
    with ``onnxsim``.

    Args:
        args: namespace produced by ``parse_args`` (weights, size, opset,
            simplify, dynamic, batch, conf_thres, iou_thres, max_det).
    """
    suppress_warnings()

    print('\nStarting: %s' % args.weights)

    print('Opening YOLOv8-Seg model\n')

    device = select_device('cpu')
    model = yolov8_export(args.weights, device)

    if len(model.names) > 0:
        print('\nCreating labels.txt file')
        # Context manager closes the file even if a write fails; explicit
        # UTF-8 keeps non-ASCII class names independent of the locale.
        with open('labels.txt', 'w', encoding='utf-8') as f:
            f.writelines(name + '\n' for name in model.names.values())

    model = nn.Sequential(model, DeepStreamOutput(len(model.names), args.conf_thres, args.iou_thres, args.max_det))

    # A single value means a square input.
    img_size = args.size * 2 if len(args.size) == 1 else args.size

    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
    # splitext only strips the final extension; splitting on '.pt' would cut
    # the name at the first '.pt' occurring anywhere in it.
    onnx_output_file = os.path.splitext(os.path.basename(args.weights))[0] + '.onnx'

    # Only the batch dimension is dynamic, for the input and all four outputs.
    dynamic_axes = {
        tensor_name: {0: 'batch'}
        for tensor_name in ('input', 'boxes', 'scores', 'classes', 'masks')
    }

    print('\nExporting the model to ONNX')
    torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
                      do_constant_folding=True, input_names=['input'], output_names=['boxes', 'scores', 'classes', 'masks'],
                      dynamic_axes=dynamic_axes if args.dynamic else None)

    if args.simplify:
        print('Simplifying the ONNX model')
        # Imported lazily so onnxsim is only required when --simplify is used.
        import onnxsim
        model_onnx = onnx.load(onnx_output_file)
        model_onnx, _ = onnxsim.simplify(model_onnx)
        onnx.save(model_onnx, onnx_output_file)

    print('Done: %s\n' % onnx_output_file)


def parse_args():
    """Parse and validate the command-line arguments of the export script.

    Returns:
        argparse.Namespace with weights, size, opset, simplify, dynamic,
        batch, conf_thres, iou_thres and max_det.

    Raises:
        SystemExit: if the weights file does not exist, if more than two size
            values are given, or if ``--dynamic`` is combined with a static
            batch size greater than 1.
    """
    parser = argparse.ArgumentParser(description='DeepStream YOLOv8-Seg conversion')
    parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)')
    parser.add_argument('-s', '--size', nargs='+', type=int, default=[640], help='Inference size [H,W] (default [640])')
    parser.add_argument('--opset', type=int, default=16, help='ONNX opset version')
    parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
    parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
    parser.add_argument('--batch', type=int, default=1, help='Static batch-size')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='Minimum confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    parser.add_argument('--max-det', type=int, default=100, help='Maximum detections')
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid weights file')
    # The size is either one value (square input) or height and width; more
    # values would produce a dummy input with too many dimensions.
    if len(args.size) > 2:
        raise SystemExit('Invalid inference size, expected SIZE or HEIGHT WIDTH')
    if args.dynamic and args.batch > 1:
        raise SystemExit('Cannot set dynamic batch-size and static batch-size at same time')
    return args


# Script entry point: parse the command line, run the export and exit with
# whatever main() returns.
if __name__ == '__main__':
    args = parse_args()
    sys.exit(main(args))

使用此代码转换出的 onnx 生成 TensorRT engine 后再运行，即使视野里没有目标，也不会异常退出。