之前在 DeepStream 6.3/7.0 里使用 YOLOv8-seg/YOLOv5-seg 模型时,都遇到了下面的错误:当帧中未检测到任何物体时,TensorRT 的形状图(shape graph)中会出现除以零的情况,TensorRT 随即报错:
ERROR: [TRT]: 1: [runner.cpp::shapeChangeHelper::621] Error Code 1: Myelin (Division by 0 detected in the shape graph. Tensor (Divisor) "sp__mye3" is equal to 0.; ) ERROR: Failed to enqueue trt inference batch ERROR: Infer context enqueue buffer failed, nvinfer error:NVDSINFER_TENSORRT_ERROR 0:07:59.483589952 19370 0xaaab062aa5e0 WARN nvinfer gstnvinfer.cpp:1418:gst_nvinfer_input_queue_loop:<primary_gie> error: Failed to queue input batch for inferencing
浏览了Nvidia相关开发网站后,发现工程师回复说这是TensorRT当前版本里的bug,后续版本会更正,可两年了这个bug依旧未被消除。
因此我在由 pt 文件生成 onnx 之前,修改了 ultralytics 里的 export_yoloV8_seg.py:在代码中将未检测到物体时的检测数目从 0 改为 1,并在被识别图像的左上角添加一个类别(class)索引超出正常范围的伪目标。
以确保:
- NMS 函数始终返回至少一个检测
- 如果没有发现真正的检测,则添加具有特定特征的虚拟检测
- 虚拟检测在处理过程中会被过滤掉,因此不会影响实际结果
- 代码始终保持至少一次检测以避免除以零的错误
这样运行起来一切正常,不再出现上面的 Error;而且由于 labels.txt 里不包含这个类别,伪目标也不会被绘制到图像上。问题得以解决。
1. 修改 NMS.forward 方法本身,确保它始终返回至少一个检测结果。修改后的代码如下:
class NMS(torch.autograd.Function):
    """Stand-in NMS whose ``forward`` fabricates indices instead of suppressing boxes."""

    @staticmethod
    def forward(self, boxes, scores, max_output_boxes_per_class=100, iou_threshold=0.45, score_threshold=0.25):
        """Return random placeholder selections of shape ``(num_det, 3)``.

        No suppression is performed; the result only provides a tensor with a
        plausible layout for code that consumes the selection.

        Args:
            self: first positional argument supplied by ``torch.autograd.Function`` (unused).
            boxes: only its ``device`` is read.
            scores: only ``shape[0]`` is read and used as the batch size.
            max_output_boxes_per_class: upper bound for the number of fabricated rows.
            iou_threshold: unused here; kept for signature parity.
            score_threshold: unused here; kept for signature parity.

        Returns:
            An int64 tensor with one row per fabricated detection. Column 0 holds
            sorted random batch ids in ``[0, batch)``, column 1 is all zeros and
            column 2 holds consecutive values starting at 100.
        """
        device = boxes.device
        batch = scores.shape[0]

        # Never fabricate more rows than the requested limit: a consumer that
        # copies the rows into a buffer sized by that limit would otherwise
        # overflow whenever the random draw exceeds it. The historical ceiling
        # of 100 is kept so the default behaviour is unchanged.
        upper = max(1, min(int(max_output_boxes_per_class), 100))
        # Always ensure at least one detection is returned.
        num_det = max(1, random.randint(0, upper))

        batches = torch.randint(0, batch, (num_det,)).sort()[0].to(device)
        idxs = torch.arange(100, 100 + num_det).to(device)
        zeros = torch.zeros((num_det,), dtype=torch.int64).to(device)
        selected_indices = torch.stack((batches, zeros, idxs), dim=1)
        return selected_indices.to(torch.int64)
2. 修改 DeepStreamOutput.forward 方法,以处理添加伪目标检测的情况:
class DeepStreamOutput(nn.Module):
    """Post-processing stage that turns raw segmentation outputs into fixed-size tensors.

    The module selects boxes through ``NMS``, pools the prototype masks with
    ``RoiAlign`` and pads the result to ``max_det`` rows per batch item. When
    a batch item has no score above ``conf_thres``, a placeholder detection
    (class id ``nc``, one past the last real class) is written into slot 0 so
    that the selection is never empty.

    NOTE(review): the placeholder logic uses Python ``for``/``if`` on tensor
    values. Under tracing-based ONNX export such branches are presumably
    decided once, from the example input, and frozen into the graph - confirm
    the exported model behaves as intended.
    """

    def __init__(self, nc, conf_thres=0.25, iou_thres=0.45, max_det=100):
        """Store the class count, the two thresholds and the output row count."""
        # Initialise nn.Module first so attribute registration is fully set up.
        super().__init__()
        self.nc = nc
        self.conf_thres = conf_thres
        self.iou_thres = iou_thres
        self.max_det = max_det

    def forward(self, x):
        """Build padded boxes, scores, classes and masks.

        Args:
            x: indexable pair. ``x[0]`` holds the predictions (assumed to be
                ``(batch, 4 + nc + mask_coeffs, anchors)`` - TODO confirm) and
                ``x[1]`` the 4-D prototype masks.

        Returns:
            Tuple ``(boxes, scores, classes, masks)``; the first three have
            ``max_det`` rows per batch item and 4, 1 and 1 columns, ``masks``
            is reshaped to ``(batch, -1, mh, mw)``.
        """
        preds = x[0].transpose(1, 2)
        # Clone: the placeholder below is written in place and must not leak
        # into the caller's prediction tensor through the slice view.
        boxes = preds[:, :, :4].clone()
        scores, classes = torch.max(preds[:, :, 4:self.nc + 4], 2, keepdim=True)
        classes = classes.float()
        masks = preds[:, :, self.nc + 4:]
        protos = x[1]

        # Add a placeholder detection to every batch item without a score above the threshold.
        placeholder_box = boxes.new_tensor([1.0, 1.0, 2.0, 2.0])
        sentinel_class = float(self.nc)  # class index outside the normal class range
        for item in range(boxes.shape[0]):
            if not torch.any(scores[item] > self.conf_thres):
                boxes[item, 0, :] = placeholder_box
                scores[item, 0, 0] = 1.0  # high confidence so the selection keeps it
                classes[item, 0, 0] = sentinel_class

        # Maps (a, b, c, d) to (a - c/2, b - d/2, a + c/2, b + d/2); presumably
        # centre/size boxes to corner boxes - TODO confirm the input layout.
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
                                      dtype=boxes.dtype, device=boxes.device)
        boxes = boxes @ convert_matrix

        # NMS takes (boxes, scores, max_output_boxes_per_class, iou_threshold,
        # score_threshold): the IoU threshold comes before the score threshold.
        selected_indices = NMS.apply(boxes, scores.transpose(1, 2).contiguous(), self.max_det, self.iou_thres,
                                     self.conf_thres)

        b, c, mh, mw = protos.shape
        n = selected_indices.shape[0]

        batch_index = selected_indices[:, 0]
        box_index = selected_indices[:, 2]

        selected_boxes = boxes[batch_index, box_index, :]
        selected_scores = scores[batch_index, box_index, :]
        selected_classes = classes[batch_index, box_index, :]
        selected_masks = masks[batch_index, box_index, :]

        # Placeholders are recognised by their sentinel class id. Comparing box
        # coordinates is unreliable because the boxes were transformed above.
        valid_mask = selected_classes[:, 0] != sentinel_class

        # If only placeholders were selected, keep one so the result is never empty.
        if not torch.any(valid_mask) and n > 0:
            valid_mask[0] = True

        # Drop the remaining placeholders.
        if torch.any(~valid_mask):
            selected_boxes = selected_boxes[valid_mask]
            selected_scores = selected_scores[valid_mask]
            selected_classes = selected_classes[valid_mask]
            selected_masks = selected_masks[valid_mask]
            batch_index = batch_index[valid_mask]
            n = selected_boxes.shape[0]

        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 'half_pixel', 'avg', int(mh), int(mw), 0,
                                      0.25)

        masks_protos = selected_masks.unsqueeze(dim=1) @ pooled_proto.float().view(n, c, mh * mw)
        masks_protos = masks_protos.sigmoid().view(-1, mh * mw)

        dets = torch.cat([selected_boxes, selected_scores, selected_classes, masks_protos], dim=1)

        # Replicate all detections for every batch item, then zero the rows
        # that belong to a different batch item.
        batched_dets = dets.unsqueeze(0).repeat(b, 1, 1)
        batch_template = torch.arange(0, b, dtype=batch_index.dtype, device=batch_index.device).unsqueeze(1)
        batched_dets = batched_dets.where((batch_index == batch_template).unsqueeze(-1), batched_dets.new_zeros(1))

        # Pad to a fixed number of rows per batch item.
        y, i = batched_dets.shape[1:]
        final_dets = batched_dets.new_zeros((b, self.max_det, i))
        final_dets[:, :y, :] = batched_dets

        final_boxes = final_dets[:, :, :4]
        final_scores = final_dets[:, :, 4:5]
        final_classes = final_dets[:, :, 5:6]
        final_masks = final_dets[:, :, 6:]
        final_masks = final_masks.view(b, -1, mh, mw)

        return final_boxes, final_scores, final_classes, final_masks
export_yoloV8_seg.py全部代码如下:
import os
import sys
import random
import argparse
import warnings
import onnx
import torch
import torch.nn as nn
from copy import deepcopy
from ultralytics import YOLO
from ultralytics.utils.torch_utils import select_device
from ultralytics.nn.modules import C2f, Detect, RTDETRDecoder
class RoiAlign(torch.autograd.Function):
    """Custom autograd function that is emitted as an ONNX ``RoiAlign`` node.

    ``forward`` performs no pooling: it returns random values with the shape
    the pooled output is expected to have. ``symbolic`` describes the ONNX
    node that represents this function in an exported graph.
    """

    @staticmethod
    def forward(self, X, rois, batch_indices, coordinate_transformation_mode='half_pixel', mode='avg',
                output_height=160, output_width=160, sampling_ratio=0, spatial_scale=0.25):
        """Return a random ``(num_rois, C, output_height, output_width)`` tensor.

        ``X`` must be 4-D; only its channel count is used. The result takes
        its device and dtype from ``rois``. All other arguments are ignored.
        """
        _, channels, _, _ = X.shape
        out_shape = (rois.shape[0], channels, output_height, output_width)
        return torch.randn(out_shape, device=rois.device, dtype=rois.dtype)

    @staticmethod
    def symbolic(g, X, rois, batch_indices, coordinate_transformation_mode='half_pixel', mode='avg',
                 output_height=160, output_width=160, sampling_ratio=0, spatial_scale=0.25):
        """Emit the ``RoiAlign`` node, forwarding every option as a node attribute."""
        attributes = {
            'coordinate_transformation_mode_s': coordinate_transformation_mode,
            'mode_s': mode,
            'output_height_i': output_height,
            'output_width_i': output_width,
            'sampling_ratio_i': sampling_ratio,
            'spatial_scale_f': spatial_scale,
        }
        return g.op("RoiAlign", X, rois, batch_indices, **attributes)
class NMS(torch.autograd.Function):
    """Custom autograd function that is emitted as an ONNX ``NonMaxSuppression`` node.

    ``forward`` does not suppress anything: it fabricates a selection with a
    plausible layout. ``symbolic`` describes the ONNX node that represents
    this function in an exported graph.
    """

    @staticmethod
    def forward(self, boxes, scores, max_output_boxes_per_class=100, iou_threshold=0.45, score_threshold=0.25):
        """Return random placeholder selections of shape ``(num_det, 3)``.

        Args:
            self: first positional argument supplied by ``torch.autograd.Function`` (unused).
            boxes: only its ``device`` is read.
            scores: only ``shape[0]`` is read and used as the batch size.
            max_output_boxes_per_class: upper bound for the number of fabricated rows.
            iou_threshold: unused here; consumed by ``symbolic``.
            score_threshold: unused here; consumed by ``symbolic``.

        Returns:
            An int64 tensor with one row per fabricated detection. Column 0 holds
            sorted random batch ids in ``[0, batch)``, column 1 is all zeros and
            column 2 holds consecutive values starting at 100.
        """
        device = boxes.device
        batch = scores.shape[0]

        # Never fabricate more rows than the requested limit: a consumer that
        # copies the rows into a buffer sized by that limit would otherwise
        # overflow whenever the random draw exceeds it. The historical ceiling
        # of 100 is kept so the default behaviour is unchanged.
        upper = max(1, min(int(max_output_boxes_per_class), 100))
        # Always ensure at least one detection is returned.
        num_det = max(1, random.randint(0, upper))

        batches = torch.randint(0, batch, (num_det,)).sort()[0].to(device)
        idxs = torch.arange(100, 100 + num_det).to(device)
        zeros = torch.zeros((num_det,), dtype=torch.int64).to(device)
        selected_indices = torch.stack((batches, zeros, idxs), dim=1)
        return selected_indices.to(torch.int64)

    @staticmethod
    def symbolic(g, boxes, scores, max_output_boxes_per_class=100, iou_threshold=0.45, score_threshold=0.25):
        """Emit the ``NonMaxSuppression`` node with the limit and both thresholds as tensors."""
        return g.op("NonMaxSuppression", boxes, scores, torch.tensor([max_output_boxes_per_class]),
                    torch.tensor([iou_threshold]), torch.tensor([score_threshold]), center_point_box_i=0)
class DeepStreamOutput(nn.Module):
    """Post-processing stage that turns raw segmentation outputs into fixed-size tensors.

    The module selects boxes through ``NMS``, pools the prototype masks with
    ``RoiAlign`` and pads the result to ``max_det`` rows per batch item. When
    a batch item has no score above ``conf_thres``, a placeholder detection
    (class id ``nc``, one past the last real class) is written into slot 0 so
    that the selection is never empty.

    NOTE(review): the placeholder logic uses Python ``for``/``if`` on tensor
    values. Under tracing-based ONNX export such branches are presumably
    decided once, from the example input, and frozen into the graph - confirm
    the exported model behaves as intended.
    """

    def __init__(self, nc, conf_thres=0.25, iou_thres=0.45, max_det=100):
        """Store the class count, the two thresholds and the output row count."""
        # Initialise nn.Module first so attribute registration is fully set up.
        super().__init__()
        self.nc = nc
        self.conf_thres = conf_thres
        self.iou_thres = iou_thres
        self.max_det = max_det

    def forward(self, x):
        """Build padded boxes, scores, classes and masks.

        Args:
            x: indexable pair. ``x[0]`` holds the predictions (assumed to be
                ``(batch, 4 + nc + mask_coeffs, anchors)`` - TODO confirm) and
                ``x[1]`` the 4-D prototype masks.

        Returns:
            Tuple ``(boxes, scores, classes, masks)``; the first three have
            ``max_det`` rows per batch item and 4, 1 and 1 columns, ``masks``
            is reshaped to ``(batch, -1, mh, mw)``.
        """
        preds = x[0].transpose(1, 2)
        # Clone: the placeholder below is written in place and must not leak
        # into the caller's prediction tensor through the slice view.
        boxes = preds[:, :, :4].clone()
        scores, classes = torch.max(preds[:, :, 4:self.nc + 4], 2, keepdim=True)
        classes = classes.float()
        masks = preds[:, :, self.nc + 4:]
        protos = x[1]

        # Add a placeholder detection to every batch item without a score above the threshold.
        placeholder_box = boxes.new_tensor([1.0, 1.0, 2.0, 2.0])
        sentinel_class = float(self.nc)  # class index outside the normal class range
        for item in range(boxes.shape[0]):
            if not torch.any(scores[item] > self.conf_thres):
                boxes[item, 0, :] = placeholder_box
                scores[item, 0, 0] = 1.0  # high confidence so the selection keeps it
                classes[item, 0, 0] = sentinel_class

        # Maps (a, b, c, d) to (a - c/2, b - d/2, a + c/2, b + d/2); presumably
        # centre/size boxes to corner boxes - TODO confirm the input layout.
        # Built once here (it was previously constructed twice).
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
                                      dtype=boxes.dtype, device=boxes.device)
        boxes = boxes @ convert_matrix

        # NMS takes (boxes, scores, max_output_boxes_per_class, iou_threshold,
        # score_threshold): the IoU threshold comes before the score threshold.
        selected_indices = NMS.apply(boxes, scores.transpose(1, 2).contiguous(), self.max_det, self.iou_thres,
                                     self.conf_thres)

        b, c, mh, mw = protos.shape
        n = selected_indices.shape[0]

        batch_index = selected_indices[:, 0]
        box_index = selected_indices[:, 2]

        selected_boxes = boxes[batch_index, box_index, :]
        selected_scores = scores[batch_index, box_index, :]
        selected_classes = classes[batch_index, box_index, :]
        selected_masks = masks[batch_index, box_index, :]

        # Placeholders are recognised by their sentinel class id. Comparing box
        # coordinates is unreliable because the boxes were transformed above.
        valid_mask = selected_classes[:, 0] != sentinel_class

        # If only placeholders were selected, keep one so the result is never empty.
        if not torch.any(valid_mask) and n > 0:
            valid_mask[0] = True

        # Drop the remaining placeholders.
        if torch.any(~valid_mask):
            selected_boxes = selected_boxes[valid_mask]
            selected_scores = selected_scores[valid_mask]
            selected_classes = selected_classes[valid_mask]
            selected_masks = selected_masks[valid_mask]
            batch_index = batch_index[valid_mask]
            n = selected_boxes.shape[0]

        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 'half_pixel', 'avg', int(mh), int(mw), 0,
                                      0.25)

        masks_protos = selected_masks.unsqueeze(dim=1) @ pooled_proto.float().view(n, c, mh * mw)
        masks_protos = masks_protos.sigmoid().view(-1, mh * mw)

        dets = torch.cat([selected_boxes, selected_scores, selected_classes, masks_protos], dim=1)

        # Replicate all detections for every batch item, then zero the rows
        # that belong to a different batch item.
        batched_dets = dets.unsqueeze(0).repeat(b, 1, 1)
        batch_template = torch.arange(0, b, dtype=batch_index.dtype, device=batch_index.device).unsqueeze(1)
        batched_dets = batched_dets.where((batch_index == batch_template).unsqueeze(-1), batched_dets.new_zeros(1))

        # Pad to a fixed number of rows per batch item.
        y, i = batched_dets.shape[1:]
        final_dets = batched_dets.new_zeros((b, self.max_det, i))
        final_dets[:, :y, :] = batched_dets

        final_boxes = final_dets[:, :, :4]
        final_scores = final_dets[:, :, 4:5]
        final_classes = final_dets[:, :, 5:6]
        final_masks = final_dets[:, :, 6:]
        final_masks = final_masks.view(b, -1, mh, mw)

        return final_boxes, final_scores, final_classes, final_masks
def suppress_warnings():
    """Install ``ignore`` filters for tracer, user and deprecation warnings."""
    silenced = (torch.jit.TracerWarning, UserWarning, DeprecationWarning)
    for category in silenced:
        warnings.filterwarnings('ignore', category=category)
def yolov8_export(weights, device):
    """Load the weights and return the inner model prepared for ONNX export.

    The inner model is deep-copied onto ``device``, frozen (no gradients),
    switched to eval mode and float precision, and fused. ``Detect`` and
    ``RTDETRDecoder`` modules are flagged for static ONNX export, and every
    ``C2f`` module has its ``forward`` replaced by ``forward_split``.
    """
    net = deepcopy(YOLO(weights).model).to(device)

    for param in net.parameters():
        param.requires_grad = False

    net.eval()
    net.float()
    net = net.fuse()

    for module in net.modules():
        if isinstance(module, (Detect, RTDETRDecoder)):
            module.dynamic = False
            module.export = True
            module.format = 'onnx'
        elif isinstance(module, C2f):
            module.forward = module.forward_split

    return net
def main(args):
    """Export the weights in ``args.weights`` to an ONNX file in the working directory.

    Steps: load and prepare the model, write ``labels.txt`` (one class name
    per line) when the model has class names, append ``DeepStreamOutput``,
    export with ``torch.onnx.export`` and optionally simplify the result.

    Args:
        args: namespace with ``weights``, ``size``, ``batch``, ``opset``,
            ``simplify``, ``dynamic``, ``conf_thres``, ``iou_thres`` and ``max_det``.

    Returns:
        None.
    """
    suppress_warnings()

    print(f'\nStarting: {args.weights}')

    print('Opening YOLOv8-Seg model\n')

    device = select_device('cpu')
    model = yolov8_export(args.weights, device)

    if len(model.names) > 0:
        print('\nCreating labels.txt file')
        # Context manager closes the file even if a write fails; UTF-8 keeps
        # non-ASCII class names independent of the platform default encoding.
        with open('labels.txt', 'w', encoding='utf-8') as labels_file:
            for name in model.names.values():
                labels_file.write(name + '\n')

    num_classes = len(model.names)
    model = nn.Sequential(model, DeepStreamOutput(num_classes, args.conf_thres, args.iou_thres, args.max_det))

    # A single size value means a square input.
    img_size = args.size * 2 if len(args.size) == 1 else args.size

    onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device)
    # splitext removes only the final extension; splitting on '.pt' would also
    # cut a name that merely contains '.pt' somewhere in the middle.
    onnx_output_file = os.path.splitext(os.path.basename(args.weights))[0] + '.onnx'

    # Only the batch axis (axis 0) of the input and every output is dynamic.
    tensor_names = ('input', 'boxes', 'scores', 'classes', 'masks')
    dynamic_axes = {tensor_name: {0: 'batch'} for tensor_name in tensor_names}

    print('\nExporting the model to ONNX')
    torch.onnx.export(model, onnx_input_im, onnx_output_file, verbose=False, opset_version=args.opset,
                      do_constant_folding=True, input_names=['input'],
                      output_names=['boxes', 'scores', 'classes', 'masks'],
                      dynamic_axes=dynamic_axes if args.dynamic else None)

    if args.simplify:
        print('Simplifying the ONNX model')
        import onnxsim
        model_onnx = onnx.load(onnx_output_file)
        model_onnx, _ = onnxsim.simplify(model_onnx)
        onnx.save(model_onnx, onnx_output_file)

    print(f'Done: {onnx_output_file}\n')
def parse_args():
    """Parse and validate the command-line options of the export script.

    Returns:
        The parsed ``argparse.Namespace``.

    Raises:
        SystemExit: if the weights path is not an existing file, if a dynamic
            batch is combined with a static batch size above 1, or if more
            than two size values are given.
    """
    parser = argparse.ArgumentParser(description='DeepStream YOLOv8-Seg conversion')
    parser.add_argument('-w', '--weights', required=True, help='Input weights (.pt) file path (required)')
    parser.add_argument('-s', '--size', nargs='+', type=int, default=[640], help='Inference size [H,W] (default [640])')
    parser.add_argument('--opset', type=int, default=16, help='ONNX opset version')
    parser.add_argument('--simplify', action='store_true', help='ONNX simplify model')
    parser.add_argument('--dynamic', action='store_true', help='Dynamic batch-size')
    parser.add_argument('--batch', type=int, default=1, help='Static batch-size')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='Minimum confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    parser.add_argument('--max-det', type=int, default=100, help='Maximum detections')
    args = parser.parse_args()

    if not os.path.isfile(args.weights):
        raise SystemExit('Invalid weights file')
    if args.dynamic and args.batch > 1:
        raise SystemExit('Cannot set dynamic batch-size and static batch-size at same time')
    # nargs='+' accepts any number of values, but the size is either one value
    # (square input) or an [H, W] pair; reject anything longer up front.
    if len(args.size) > 2:
        raise SystemExit('Invalid inference size: expected one value or two values [H W]')
    return args
# Script entry point: parse the command line, run the export and pass
# main()'s return value to sys.exit as the exit status.
if __name__ == '__main__':
    sys.exit(main(parse_args()))
用此代码导出的 onnx 生成 TensorRT engine 后再运行,即使视野里没有任何目标,程序也不会异常退出。