0 Introduction
To make later reuse and extension easier, the YOLOv5 post-processing is wrapped into a class, with the corresponding operations implemented as CUDA kernel functions. With this encapsulation in place, the code can be reused directly on the server side later.
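Before the code itself, here is a minimal sketch of how the class is meant to be driven from a client. Only the setParams/initialize/enqueue/terminate call sequence comes from the interface below; the run_postprocess_example wrapper and the buffer handling are illustrative assumptions, not part of the original code.
#include "yolov5s_postprocess.h"
#include <vector>
// Hypothetical driver: feed one batch of raw YOLOv5 output (on the host) through the class.
void run_postprocess_example(const float* raw_output /* boxNum * elemSize floats */) {
    const int batchSize = 1, boxNum = 25200, elemSize = 85;   // 80 classes + 5
    yolov5s::Yolov5sPostProcess post(batchSize);
    // setParams must come before initialize(), which uploads the thresholds to the GPU.
    post.setParams(640, 640, 80, 0.5f,
                   std::vector<float>(80, 0.5f),    // per-class NMS thresholds
                   std::vector<float>(80, 0.25f));  // per-class score thresholds
    if (post.initialize(batchSize, boxNum, elemSize) != 0) return;
    std::vector<float> result(batchSize * OBJECT_TOPK * OUT_ELEM_SIZE);
    const void* inputs[] = { raw_output };
    void* outputs[] = { result.data() };
    post.enqueue(batchSize, boxNum, elemSize, inputs, outputs);
    // result[0] holds the number of kept boxes, followed by
    // [class_id, score, x1, y1, x2, y2] records of OUT_ELEM_SIZE floats each.
    post.terminate();
}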
1 yolov5s_postprocess.h
/**
* @file yolov5s_postprocess.h
* @brief YOLOv5s post-processing class definition
*
* @author cumtchw
* @date 2025-07-17
* @version 1.0(2025-07-17): Initial version.
*
* Copyright (c) 2025 cumtchw. All rights reserved.
*/
#ifndef _YOLOV5_POSTPROCESS_H_
#define _YOLOV5_POSTPROCESS_H_
#include <vector>
#include <cuda_runtime.h>
#include <iostream>
#include <algorithm>
#include <functional>
#include "yolo.h"
#include "layer_params_tool.h"
namespace yolov5s{
class Yolov5sPostProcess{
public:
explicit Yolov5sPostProcess(int maxBatchSize);
Yolov5sPostProcess();
~Yolov5sPostProcess();
int initialize(int batchSize, int boxNum, int elemSize);
int setParams(int netW, int netH, int classNum, float objThresh, const std::vector<float>& nmsThresh, const std::vector<float>& classThresh);
int enqueue(
int batchSize,
int boxNum,
int elemSize,
const void *const *inputs,
void *const *outputs
);
int terminate();
cudaStream_t stream_;
private:
//yolo::Yolov5Param yolov5_param_;
int net_w_ = 640;
int net_h_ = 640;
int class_num_ = 80;
float obj_thresh_ = 0.5;
int max_batch_size_ = 1024;
std::vector<float> nms_thresh_ = std::vector<float>(class_num_, 0.5);
std::vector<float> class_thresh_ = std::vector<float>(class_num_, 0.25);
int max_box_num_ = 25200;
float *raw_yolo_batch_data_{};
float* decoded_yolo_data_host_{};
float* decoded_yolo_data_device_{};
float* class_thresh_data_{};
float* nms_thresh_data_{};
std::vector<float*> refined_yolov5s_data_host_; //by classes
std::vector<float*> refined_yolov5s_data_device_; //by classes
std::vector<MASK_TYPE*> nms_mask_data_; //by classes
int nms_mask_num_{}; // number of MASK_TYPE elements in each per-class mask
float* topdata_{};
};
}// end namespace yolov5s
#endif
2 yolov5s_postprocess.cc
#include "yolov5s_postprocess.h"
#include "yolo.h"
#include <opencv2/opencv.hpp>
#include <opencv2/core/version.hpp>
#if CV_MAJOR_VERSION == 2
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#elif CV_MAJOR_VERSION >= 3
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#endif
using namespace cv;
namespace yolov5s{
Yolov5sPostProcess::Yolov5sPostProcess(int maxBatchSize)
: max_batch_size_(maxBatchSize){}
Yolov5sPostProcess::Yolov5sPostProcess() = default;
Yolov5sPostProcess::~Yolov5sPostProcess() = default;
int Yolov5sPostProcess::initialize(int batchSize, int boxNum, int elemSize) {
cudaStreamCreate(&stream_);
int raw_elem_size = class_num_ + 5;
size_t raw_buffer_size = static_cast<size_t>(max_batch_size_) * max_box_num_ * raw_elem_size * sizeof(float);
cudaError_t err;
//Holds the raw YOLOv5 output after it is copied from the CPU to the GPU.
err = cudaMalloc(&raw_yolo_batch_data_, raw_buffer_size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed for raw_yolo_batch_data_, size=" << raw_buffer_size
<< ", error: " << cudaGetErrorString(err) << std::endl;
return -1;
}
size_t decoded_buffer_size = static_cast<size_t>(max_box_num_) * OUT_ELEM_SIZE * sizeof(float);
//Holds the decoded data on the GPU.
err = cudaMalloc(&decoded_yolo_data_device_, decoded_buffer_size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed for decoded_yolo_data_device_, size=" << decoded_buffer_size
<< ", error: " << cudaGetErrorString(err) << std::endl;
cudaFree(raw_yolo_batch_data_);
return -1;
}
//Holds the decoded data on the CPU (pinned host memory).
err = cudaHostAlloc(&decoded_yolo_data_host_, decoded_buffer_size, cudaHostAllocDefault);
if (err != cudaSuccess) {
std::cerr << "cudaHostAlloc failed for decoded_yolo_data_host_, size=" << decoded_buffer_size
<< ", error: " << cudaGetErrorString(err) << std::endl;
cudaFree(raw_yolo_batch_data_);
cudaFree(decoded_yolo_data_device_);
return -1;
}
err = cudaMalloc(&class_thresh_data_, class_num_ * sizeof(float));
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed for class_thresh_data_: " << cudaGetErrorString(err) << std::endl;
terminate(); // free the allocations made above
return -1;
}
err = cudaMalloc(&nms_thresh_data_, class_num_ * sizeof(float));
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed for nms_thresh_data_: " << cudaGetErrorString(err) << std::endl;
terminate();
return -1;
}
err = cudaMemcpy(class_thresh_data_, class_thresh_.data(), class_num_ * sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "cudaMemcpy failed for class_thresh_data_: " << cudaGetErrorString(err) << std::endl;
terminate();
return -1;
}
err = cudaMemcpy(nms_thresh_data_, nms_thresh_.data(), class_num_ * sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "cudaMemcpy failed for nms_thresh_data_: " << cudaGetErrorString(err) << std::endl;
terminate();
return -1;
}
refined_yolov5s_data_host_.resize(class_num_);
refined_yolov5s_data_device_.resize(class_num_);
for (int i = 0; i < class_num_; ++i) {
size_t buffer_size = max_box_num_ * OUT_ELEM_SIZE * sizeof(float);
err = cudaHostAlloc((void**)&refined_yolov5s_data_host_[i], buffer_size, cudaHostAllocDefault);
if (err != cudaSuccess) {
std::cerr << "cudaHostAlloc failed for refined_yolov5s_data_host_[" << i << "]: "
<< cudaGetErrorString(err) << std::endl;
terminate();
return -1;
}
err = cudaMalloc((void**)&refined_yolov5s_data_device_[i], buffer_size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed for refined_yolov5s_data_device_[" << i << "]: "
<< cudaGetErrorString(err) << std::endl;
terminate();
return -1;
}
}
const int col_blocks = DIVUP(boxNum, threadsPerBlock);
nms_mask_num_ = boxNum * col_blocks;
nms_mask_data_.resize(class_num_);
// Allocate GPU memory for each class.
for (int i = 0; i < class_num_; ++i) {
cudaError_t err = cudaMalloc((void**)&nms_mask_data_[i], nms_mask_num_ * sizeof(MASK_TYPE));
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed for nms_mask_data_[" << i << "], size: "
<< nms_mask_num_ * sizeof(MASK_TYPE) << ", error: "
<< cudaGetErrorString(err) << std::endl;
return -1;
}
}
// topdata_ is a float* holding OBJECT_TOPK * OUT_ELEM_SIZE floats.
size_t topdata_size = OBJECT_TOPK * OUT_ELEM_SIZE * sizeof(float);
err = cudaMalloc((void**)&topdata_, topdata_size);
if (err != cudaSuccess) {
std::cerr << "cudaMalloc failed for topdata_, size: " << topdata_size
<< ", error: " << cudaGetErrorString(err) << std::endl;
return -1;
}
return 0;
}
int Yolov5sPostProcess::setParams(
int netW,
int netH,
int classNum,
float objThresh,
const std::vector<float>& nmsThresh,
const std::vector<float>& classThresh)
{
net_w_ = netW;
net_h_ = netH;
class_num_ = classNum;
obj_thresh_ = objThresh;
nms_thresh_ = nmsThresh;
class_thresh_ = classThresh;
return 0;
}
int Yolov5sPostProcess::enqueue(
int batchSize,
int boxNum,
int elemSize,
const void *const *inputs,
void *const *outputs
)
{
std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
const float* cpu_input = static_cast<const float*>(inputs[0]);
unsigned long long input_size = batchSize * boxNum * elemSize * sizeof(float);
cudaMemcpyAsync(raw_yolo_batch_data_, cpu_input, input_size, cudaMemcpyHostToDevice, stream_);
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< cudaMemcpyAsync time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count() * 1000 << " ms" << std::endl;
for(int batch = 0; batch < batchSize; ++batch)
{
float *batch_input = raw_yolo_batch_data_ + batch * boxNum * elemSize;
yolov5sDecodeGpu(batch_input,
boxNum,
elemSize,
class_num_,
obj_thresh_,
class_thresh_data_,
decoded_yolo_data_device_,
stream_);
cudaStreamSynchronize(stream_);
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< yolov5sDecodeGpu time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count() * 1000 << " ms" << std::endl;
cudaMemcpyAsync(decoded_yolo_data_host_, decoded_yolo_data_device_, boxNum * OUT_ELEM_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream_);
cudaStreamSynchronize(stream_);
std::chrono::high_resolution_clock::time_point t3 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< cudaMemcpyAsync decoded_yolo_data_host_ time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t3 - t2).count() * 1000 << " ms" << std::endl;
std::vector<int> refinedBoxNumHost(class_num_);
refineYolov5sBoxesCpu(decoded_yolo_data_host_,
boxNum,
OUT_ELEM_SIZE,
class_num_,
refined_yolov5s_data_host_,
refinedBoxNumHost
);
std::chrono::high_resolution_clock::time_point t4 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< refineYolov5sBoxesCpu time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t4 - t3).count() * 1000 << " ms" << std::endl;
for (int c = 0; c < class_num_; ++c) {
if (refinedBoxNumHost[c] == 0) continue;
//OUT_ELEM_SIZE must be used here, not elemSize.
int size = refinedBoxNumHost[c] * OUT_ELEM_SIZE * sizeof(float);
cudaMemcpyAsync(refined_yolov5s_data_device_[c], // destination on the device
refined_yolov5s_data_host_[c], // source on the host
size,
cudaMemcpyHostToDevice,
stream_);
}
std::chrono::high_resolution_clock::time_point t5 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< cudaMemcpyAsync time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t5 - t4).count() * 1000 << " ms" << std::endl;
std::vector<std::vector<int>> keepIndices(class_num_); // per-class indices of the boxes finally kept (on the host)
std::vector<int> outputNums(class_num_, 0); // number of boxes kept per class
for (int c = 0; c < class_num_; ++c) {
if (refinedBoxNumHost[c] == 0) continue;
keepIndices[c].resize(refinedBoxNumHost[c]);
// Clear the NMS mask.
cudaMemsetAsync(nms_mask_data_[c], 0, nms_mask_num_ * sizeof(MASK_TYPE), stream_);
yolov5sNmsGpu(
refined_yolov5s_data_device_[c],
refinedBoxNumHost[c],
nms_mask_data_[c],
nms_thresh_[c], // per-class NMS threshold, read from the host-side copy (not the device buffer)
keepIndices[c].data(),
&outputNums[c],
OUT_ELEM_SIZE,
stream_
);
}
cudaStreamSynchronize(stream_);
std::chrono::high_resolution_clock::time_point t6 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< yolov5sNmsGpu time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t6 - t5).count() * 1000 << " ms" << std::endl;
// Initialize topdata_ (OBJECT_TOPK * OUT_ELEM_SIZE floats).
if (cudaMemset(topdata_, 0xFF, OBJECT_TOPK * OUT_ELEM_SIZE * sizeof(float)) != cudaSuccess) {
// fallback: fill on the CPU
for (int i = 0; i < OBJECT_TOPK * OUT_ELEM_SIZE; ++i)
topdata_[i] = -1.f;
}
int k = 1; // topdata_[0] stores the box count, so data writes start at k = 1
bool finished = false;
for (int c = 0; c < class_num_ && !finished; ++c) {
int n = outputNums[c];
float* class_data = refined_yolov5s_data_device_[c];
for (int j = 0; j < n; ++j) {
float* box_data = class_data + keepIndices[c][j] * OUT_ELEM_SIZE; // refined buffers hold OUT_ELEM_SIZE floats per box
for (int idx = 0; idx < OUT_ELEM_SIZE; ++idx) {
topdata_[k * OUT_ELEM_SIZE + idx] = box_data[idx];
}
k++;
if (k > OBJECT_TOPK - 1) {
finished = true;
break;
}
}
}
topdata_[0] = k - 1; // write the final box count into element 0
std::chrono::high_resolution_clock::time_point t7 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< cudaMemset time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t7 - t6).count() * 1000 << " ms" << std::endl;
int data_size = OBJECT_TOPK * OUT_ELEM_SIZE * sizeof(float);
cudaMemcpyAsync(static_cast<char*>(*outputs) + batch * data_size, topdata_, data_size, cudaMemcpyDeviceToHost, stream_);
std::chrono::high_resolution_clock::time_point t8 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< cudaMemcpyAsync time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t8 - t7).count() * 1000 << " ms" << std::endl;
}
cudaStreamSynchronize(stream_); // make sure the per-batch device-to-host copies have finished before reading the results
for(int batch = 0; batch < batchSize; ++batch)
{
float * resultPtr = (float *)*outputs + batch * OBJECT_TOPK * OUT_ELEM_SIZE;
for(int i = 0; i < OBJECT_TOPK; i++)
{
std::cout<<"i===="<<i<<std::endl;
for(int j = 0; j < OUT_ELEM_SIZE; j++)
{
std::cout<<"resultPtr["<<j<<"]:::::::::"<<resultPtr[j]<<std::endl;
}
resultPtr = resultPtr + OUT_ELEM_SIZE;
}
}
std::chrono::high_resolution_clock::time_point t9 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< enqueue inside all time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t9 - t0).count() * 1000 << " ms" << std::endl;
//cudaStreamDestroy(stream_);
return 0;
}
int Yolov5sPostProcess::terminate() {
if (raw_yolo_batch_data_) {
cudaFree(raw_yolo_batch_data_);
raw_yolo_batch_data_ = nullptr;
}
if (decoded_yolo_data_device_) {
cudaFree(decoded_yolo_data_device_);
decoded_yolo_data_device_ = nullptr;
}
if (decoded_yolo_data_host_) {
cudaFreeHost(decoded_yolo_data_host_);
decoded_yolo_data_host_ = nullptr;
}
if (class_thresh_data_) {
cudaFree(class_thresh_data_);
class_thresh_data_ = nullptr;
}
if (nms_thresh_data_) {
cudaFree(nms_thresh_data_);
nms_thresh_data_ = nullptr;
}
for (size_t i = 0; i < refined_yolov5s_data_host_.size(); ++i) {
if (refined_yolov5s_data_host_[i]) {
cudaFreeHost(refined_yolov5s_data_host_[i]);
refined_yolov5s_data_host_[i] = nullptr;
}
if (refined_yolov5s_data_device_[i]) {
cudaFree(refined_yolov5s_data_device_[i]);
refined_yolov5s_data_device_[i] = nullptr;
}
}
refined_yolov5s_data_host_.clear();
refined_yolov5s_data_device_.clear();
for (size_t i = 0; i < nms_mask_data_.size(); ++i) {
if (nms_mask_data_[i]) {
cudaFree(nms_mask_data_[i]);
nms_mask_data_[i] = nullptr;
}
}
nms_mask_data_.clear();
if (topdata_) {
cudaFree(topdata_);
topdata_ = nullptr;
}
return 0;
}
}// end namespace yolov5s
3 yolo.h
#ifndef _YOLO_H_
#define _YOLO_H_
#include "cuda.h"
#include <vector>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
#define DEVICE_INLINE __device__ __forceinline__
using MASK_TYPE = unsigned long long;
const int threadsPerBlock = sizeof(MASK_TYPE) * 8;
const int MAX_CLASSES = 80;
//const int OBJECT_TOPK = 300;
const int OBJECT_TOPK = 100;
const int OUT_ELEM_SIZE = 6;
//yolov5s decode
void yolov5sDecodeGpu(const float* input, const int input_box_num, const int input_elem_size
, const int classes, const float obj_thresh, const float* class_thresh, float* decoded_obb_data, cudaStream_t ss);
void refineYolov5sBoxesCpu(const float* decoded_boxes,
int box_num,
int elem_size, // == 6
int class_num,
std::vector<float*>& outputs,
std::vector<int>& box_nums);
// yolov5s NMS
void yolov5sNmsGpu(const float* input, int obbsNum, MASK_TYPE* nmsMask, float nmsThresh, int* keepOut, int* numOut, int elemSize, cudaStream_t stream);
#endif
4 yolov5s_postprocess.cu
#include "yolo.h"
#include "yolov5s_postprocess.h"
static const int YOLOV5S_CUDA_NUM_THREADS = 512;
__global__ void __launch_bounds__(1024) yolov5sDecodeKernel(const float* input, int input_box_num, int input_elem_size,
int classes, float obj_thresh, const float* class_thresh,
float* decoded_data) {
const int OUT_ELEM_SIZE = 6; // class_id + score + x1 + y1 + x2 + y2
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= input_box_num) return;
const float* cur_input = input + idx * input_elem_size;
float cx = cur_input[0];
float cy = cur_input[1];
float w = cur_input[2];
float h = cur_input[3];
float obj_conf = cur_input[4];
if (obj_conf < obj_thresh) {
decoded_data[idx * OUT_ELEM_SIZE + 1] = 0.f;
return;
}
// Find the class with the maximum score.
int class_id = -1;
float class_conf = -1.f;
for (int c = 0; c < classes; ++c) {
float conf = cur_input[5 + c];
if (conf > class_conf) {
class_conf = conf;
class_id = c;
}
}
float final_score = obj_conf * class_conf;
if (final_score < class_thresh[class_id]) {
decoded_data[idx * OUT_ELEM_SIZE + 1] = 0.f;
return;
}
float x1 = cx - w / 2;
float y1 = cy - h / 2;
float x2 = cx + w / 2;
float y2 = cy + h / 2;
float* out = decoded_data + idx * OUT_ELEM_SIZE;
out[0] = static_cast<float>(class_id); // class_id
out[1] = final_score; // confidence
out[2] = x1;
out[3] = y1;
out[4] = x2;
out[5] = y2;
}
void yolov5sDecodeGpu(const float* input, const int input_box_num, const int input_elem_size
, const int classes, const float obj_thresh, const float* class_thresh, float* decoded_yolo_data, cudaStream_t ss) {
if (input == nullptr || decoded_yolo_data == nullptr) {
return;
}
yolov5sDecodeKernel<<<DIVUP(input_box_num, YOLOV5S_CUDA_NUM_THREADS), YOLOV5S_CUDA_NUM_THREADS, 0, ss >>> (
input, input_box_num, input_elem_size, classes, obj_thresh, class_thresh, decoded_yolo_data);
}
void refineYolov5sBoxesCpu(const float* decoded_boxes,
int box_num,
int elem_size, // == 6
int class_num,
std::vector<float*>& outputs,
std::vector<int>& box_nums) {
float* data = const_cast<float*>(decoded_boxes);
std::vector<float*> outputs_data(class_num);
box_nums.assign(class_num, 0); // reset per-class counts
auto data_next = [&data, elem_size]() {
data += elem_size;
};
for (int i = 0; i < box_num; ++i) {
float class_id = data[0];
float score = data[1];
if (score < 0.2) {
data_next();
continue;
}
int cls = static_cast<int>(class_id);
if (outputs_data[cls] == nullptr) {
outputs_data[cls] = outputs[cls];
}
float* output = outputs_data[cls] + box_nums[cls] * elem_size;
for (int j = 0; j < elem_size; ++j) {
output[j] = data[j];
}
box_nums[cls]++;
data_next();
}
// Sort each class's boxes by score in descending order.
struct NormalizedBBox {
float class_id;
float score;
float x1, y1, x2, y2;
bool operator>(const NormalizedBBox& other) const {
return score > other.score;
}
};
for (int i = 0; i < class_num; ++i) {
int n = box_nums[i];
if (n == 0) continue;
NormalizedBBox* ptr = reinterpret_cast<NormalizedBBox*>(outputs[i]);
std::sort(ptr, ptr + n, std::greater<NormalizedBBox>());
}
return;
}
// Device function computing the IoU of two boxes given as [x1, y1, x2, y2].
__device__ float boxIoU(const float* a, const float* b) {
float x1 = max(a[0], b[0]);
float y1 = max(a[1], b[1]);
float x2 = min(a[2], b[2]);
float y2 = min(a[3], b[3]);
float w = max(0.0f, x2 - x1);
float h = max(0.0f, y2 - y1);
float inter = w * h;
float area_a = (a[2] - a[0]) * (a[3] - a[1]);
float area_b = (b[2] - b[0]) * (b[3] - b[1]);
float union_area = area_a + area_b - inter;
return inter / union_area;
}
__global__ void __launch_bounds__(1024) yolov5s_nms_kernel(
const int n_boxes,
const float nms_threshold,
const float* dev_boxes, // [n_boxes * elemSize]; each box has elemSize floats laid out as [class_id, score, x1, y1, x2, y2]
MASK_TYPE* dev_mask,
const int elemSize
) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 6]; // sized for elemSize == OUT_ELEM_SIZE (6); use dynamic shared memory if elemSize can grow
__shared__ MASK_TYPE block_mask[threadsPerBlock];
if (threadIdx.x < col_size) {
for (int i = 0; i < elemSize; ++i) {
block_boxes[threadIdx.x * elemSize + i] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * elemSize + i];
}
}
__syncthreads();
if (threadIdx.x < row_size) {
int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float* cur_box = dev_boxes + cur_box_idx * elemSize;
MASK_TYPE t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1; // only compare above the diagonal to avoid duplicate pairs
}
for (int i = start; i < col_size; ++i) {
const float* comp_box = block_boxes + i * elemSize;
// The box layout is [class_id, score, x1, y1, x2, y2], so the coordinates start at offset 2.
if (boxIoU(cur_box + 2, comp_box + 2) > nms_threshold) {
t |= 1ULL << i;
}
}
block_mask[threadIdx.x] = t;
}
__syncthreads();
if (threadIdx.x < row_size) {
int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
dev_mask[cur_box_idx * ((n_boxes + threadsPerBlock - 1) / threadsPerBlock) + col_start] = block_mask[threadIdx.x];
}
}
void yolov5sNmsGpu(
const float* dev_boxes,
int box_num,
MASK_TYPE* dev_mask,
float nms_thresh,
int* keep_indices,
int* keep_count,
int elemSize,
cudaStream_t stream
) {
if (dev_boxes == nullptr || box_num == 0 || dev_mask == nullptr) {
// basic input check
return;
}
int col_blocks = (box_num + threadsPerBlock - 1) / threadsPerBlock;
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
yolov5s_nms_kernel<<<blocks, threads, 0, stream>>>(
box_num, nms_thresh, dev_boxes, dev_mask, elemSize);
cudaStreamSynchronize(stream);
std::vector<MASK_TYPE> remv(col_blocks, 0);
int num_to_keep = 0;
for (int i = 0; i < box_num; ++i) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_indices[num_to_keep++] = i;
MASK_TYPE* p = dev_mask + i * col_blocks;
for (int j = nblock; j < col_blocks; ++j) {
remv[j] |= p[j];
}
}
}
*keep_count = num_to_keep;
}
5 Building and fixing compile errors
cmake -DCMAKE_INSTALL_PREFIX=../install -DTRITON_VERSION=`cat /volume/triton_client_code_2.37.0/TRITON_VERSION` -DTRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server -DTRITON_COMMON_REPO_TAG=main -DTRITON_CORE_REPO_TAG=main -DTRITON_THIRD_PARTY_REPO_TAG=main -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF -DTRITON_ENABLE_JAVA_HTTP=OFF -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON -DTRITON_ENABLE_GPU=OFF ..
CMake Error at build/_deps/repo-third-party-src/CMakeLists.txt:26 (cmake_minimum_required):
CMake 3.31.8 or higher is required. You are running version 3.22.1
My guess was that the third_party being pulled was main, i.e. the latest, so it required a newer CMake. I checked https://github.com/triton-inference-server/third_party and found that its CMakeLists.txt had been updated the previous week to raise the required CMake version, which explains why the same container built fine two weeks earlier but failed this time.
I did not originally want to upgrade CMake, so I tried pinning third_party to an older tag instead of main, but that only caused more problems. After going around in circles, upgrading CMake was the way out after all.
I downloaded 4.0.5 from https://cmake.org/download/, re-ran cmake and rebuilt, and hit another batch of errors.
I then also switched the client code to the latest main branch, and there were still various errors.
Finally I installed CMake 3.31.8 instead (I had suspected the brand-new 4.0.5 might be too bleeding-edge, and indeed it was a problem). With CMake 3.31.8 and the latest main client code, the configuration went through.
cmake -DCMAKE_INSTALL_PREFIX=../install -DTRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server -DTRITON_COMMON_REPO_TAG=main -DTRITON_CORE_REPO_TAG=main -DTRITON_THIRD_PARTY_REPO_TAG=main -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF -DTRITON_ENABLE_JAVA_HTTP=OFF -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON -DTRITON_ENABLE_GPU=OFF ..
#make cc-clients clean
make cc-clients -j16
Then the screen filled with errors:
In file included from /volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:1:
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.h:32:13: error: 'cudaStream_t' has not been declared
32 | cudaStream_t stream
| ^~~~~~~~~~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.h:53:21: error: 'MASK_TYPE' was not declared in this scope
53 | std::vector<MASK_TYPE*> nms_mask_data_; //by classes
| ^~~~~~~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.h:53:31: error: template argument 1 is invalid
53 | std::vector<MASK_TYPE*> nms_mask_data_; //by classes
| ^
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.h:53:31: error: template argument 2 is invalid
In file included from /volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:2:
/volume/triton_client_main/src/c++/examples/yolo.h:25:102: error: 'cudaStream_t' has not been declared
25 | , const int classes, const float obj_thresh, const float* class_thresh, float* decoded_obb_data, cudaStream_t ss);
| ^~~~~~~~~~~~
/volume/triton_client_main/src/c++/examples/yolo.h:28:117: error: 'cudaStream_t' has not been declared
28 | void yolov5sNmsGpu(const float* input, int obbsNum, MASK_TYPE* nmsMask, float nmsThresh, int* keepOut, int* numOut, cudaStream_t stream);
| ^~~~~~~~~~~~
In file included from /volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:2:
/volume/triton_client_main/src/c++/examples/yolo.h:32:118: error: 'cudaStream_t' has not been declared
32 | , const int classes, const float obj_thresh, const float* class_thresh, float* decoded_obb_data, cudaStream_t ss) ;
| ^~~~~~~~~~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc: In member function 'int yolov5s::Yolov5sPostProcess::initialize()':
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:14:5: error: 'cudaError_t' was not declared in this scope; did you mean 'cudaError_enum'?
14 | cudaError_t err;
| ^~~~~~~~~~~
| cudaError_enum
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:17:5: error: 'err' was not declared in this scope
17 | err = cudaMalloc(&raw_yolo_batch_data_, raw_buffer_size);
| ^~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:17:11: error: 'cudaMalloc' was not declared in this scope; did you mean 'cuMemAlloc'?
17 | err = cudaMalloc(&raw_yolo_batch_data_, raw_buffer_size);
| ^~~~~~~~~~
| cuMemAlloc
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:18:16: error: 'cudaSuccess' was not declared in this scope
18 | if (err != cudaSuccess) {
| ^~~~~~~~~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:19:14: error: 'cerr' is not a member of 'std'
19 | std::cerr << "cudaMalloc failed for raw_yolo_batch_data_, size=" << raw_buffer_size
| ^~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:3:1: note: 'std::cerr' is defined in header '<iostream>'; did you forget to '#include <iostream>'?
2 | #include "yolo.h"
+++ |+#include <iostream>
3 |
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:20:37: error: 'cudaGetErrorString' was not declared in this scope; did you mean 'cuGetErrorString'?
20 | << ", error: " << cudaGetErrorString(err) << std::endl;
| ^~~~~~~~~~~~~~~~~~
| cuGetErrorString
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:20:69: error: 'endl' is not a member of 'std'
20 | << ", error: " << cudaGetErrorString(err) << std::endl;
| ^~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:3:1: note: 'std::endl' is defined in header '<ostream>'; did you forget to '#include <ostream>'?
2 | #include "yolo.h"
+++ |+#include <ostream>
3 |
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:24:70: error: 'decoded_elem_size_' was not declared in this scope; did you mean 'decoded_buffer_size'?
24 | size_t decoded_buffer_size = static_cast<size_t>(max_box_num_) * decoded_elem_size_ * sizeof(float);
| ^~~~~~~~~~~~~~~~~~
| decoded_buffer_size
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:28:16: error: 'cudaSuccess' was not declared in this scope
28 | if (err != cudaSuccess) {
| ^~~~~~~~~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:29:14: error: 'cerr' is not a member of 'std'
29 | std::cerr << "cudaMalloc failed for decoded_yolo_data_device_, size=" << decoded_buffer_size
| ^~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:29:14: note: 'std::cerr' is defined in header '<iostream>'; did you forget to '#include <iostream>'?
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cu:1:10: fatal error: yolov5s.h: No such file or directory
1 | #include "yolov5s.h"
| ^~~~~~~~~~~
compilation terminated.
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:30:37: error: 'cudaGetErrorString' was not declared in this scope; did you mean 'cuGetErrorString'?
30 | << ", error: " << cudaGetErrorString(err) << std::endl;
| ^~~~~~~~~~~~~~~~~~
| cuGetErrorString
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:30:69: error: 'endl' is not a member of 'std'
30 | << ", error: " << cudaGetErrorString(err) << std::endl;
| ^~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:30:69: note: 'std::endl' is defined in header '<ostream>'; did you forget to '#include <ostream>'?
make[6]: *** [examples/CMakeFiles/yolov5s.dir/build.make:108: examples/CMakeFiles/yolov5s.dir/yolov5s_postprocess.cu.o] Error 1
make[6]: *** Waiting for unfinished jobs....
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:31:9: error: 'cudaFree' was not declared in this scope; did you mean 'cuMemFree'?
31 | cudaFree(raw_yolo_batch_data_);
| ^~~~~~~~
| cuMemFree
[ 98%] Built target yolov7-tiny
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:35:72: error: 'cudaHostAllocDefault' was not declared in this scope
35 | err = cudaHostAlloc(&decoded_yolo_data_host_, decoded_buffer_size, cudaHostAllocDefault);
| ^~~~~~~~~~~~~~~~~~~~
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:35:11: erro
These had to be fixed one by one; the long bug-fixing process in between is omitted here.
6 Running and fixing runtime errors
On the server, enter the server container and run:
./bin/tritonserver --model-repository=./models_yolov7tiny/ --log-verbose=1
Then, on the client side, try running the command directly first; it is bound to fail:
../install/bin/yolov5s yolov5s /volume/triton_client_main/images/bus/ 1
6.1 error: CUDA driver version is insufficient for CUDA runtime
The error:
root:/volume/triton_client_main/build# ../install/bin/yolov5s yolov5s /volume/triton_client_main/images/bus/ 1
model_metadata::::::::::{"name":"yolov5s","versions":["1"],"platform":"onnxruntime_onnx","inputs":[{"name":"images","datatype":"FP32","shape":[-1,3,-1,-1]}],"outputs":[{"name":"output0","datatype":"FP32","shape":[-1,-1,-1]}]}
model_config:::::::{"name":"yolov5s","platform":"onnxruntime_onnx","backend":"onnxruntime","runtime":"","version_policy":{"latest":{"num_versions":1}},"max_batch_size":64,"input":[{"name":"images","data_type":"TYPE_FP32","format":"FORMAT_NONE","dims":[3,-1,-1],"is_shape_tensor":false,"allow_ragged_batch":false,"optional":false}],"output":[{"name":"output0","data_type":"TYPE_FP32","dims":[-1,-1],"label_filename":"","is_shape_tensor":false}],"batch_input":[],"batch_output":[],"optimization":{"priority":"PRIORITY_DEFAULT","input_pinned_memory":{"enable":true},"output_pinned_memory":{"enable":true},"gather_kernel_buffer_threshold":0,"eager_batching":false},"dynamic_batching":{"preferred_batch_size":[4,8,16,32],"max_queue_delay_microseconds":100,"preserve_ordering":false,"priority_levels":0,"default_priority_level":0,"priority_queue_policy":{}},"instance_group":[{"name":"yolov5s_0","kind":"KIND_GPU","count":1,"gpus":[7],"secondary_devices":[],"profile":[],"passive":false,"host_policy":""}],"default_model_filename":"model.onnx","cc_model_filenames":{},"metric_tags":{},"parameters":{},"model_warmup":[]}
=== Parsed ModelInfo ===
input_name_: images
output_name_: output0
input_datatype_: FP32
input_format_: FORMAT_NONE
input shape: [C=3, H=640, W=640]
type1_: 5, type3_: 21
max_batch_size_: 64
[Time] imread: 12.9332 ms
[Time] cvtColor: 4.7253 ms
[Time] resize: 3.06634 ms
[Time] convertTo: 1.27435 ms
[Time] normalize (mul): 1.99668 ms
[Time] input_data resize: 0.566866 ms
[Time] create Mat channel wrappers: 0.00252 ms
[Time] split: 0.506386 ms
[Time] Total preprocess time:================== 25.2356 ms
preprocess time:25.833982 ms
inference time:261.556538 ms
Request 0, batch size 1
cudaMalloc failed for raw_yolo_batch_data_, size=32979088800000, error: CUDA driver version is insufficient for CUDA runtime version
Segmentation fault (core dumped)
I asked my colleagues and none of them had run into this. Then it occurred to me that the earlier cmake command had -DTRITON_ENABLE_GPU=ON, so this time I changed it to OFF and rebuilt, which produced:
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.h:16:10: fatal error: cuda_runtime.h: No such file or directory
16 | #include <cuda_runtime.h>
| ^~~~~~~~~~~~~~~~
compilation terminated.
make[6]: *** [examples/CMakeFiles/yolov5s.dir/build.make:93: examples/CMakeFiles/yolov5s.dir/yolov5s_postprocess.cc.o] Error 1
make[6]: *** Waiting for unfinished jobs....
In file included from /volume/triton_client_main/src/c++/examples/yolov5s.cc:34:
/volume/triton_client_main/src/c++/examples/yolov5s_postprocess.h:16:10: fatal error: cuda_runtime.h: No such file or directory
16 | #include <cuda_runtime.h>
| ^~~~~~~~~~~~~~~~
compilation terminated.
Then I searched for this header file:
find / -iname cuda_runtime.h
/usr/local/cuda-12.1/targets/x86_64-linux/include/cuda_runtime.h
/opt/xxx/cuda/targets/x86_64-linux/include/cuda_runtime.h
That raised a question: why is there a /usr/local/cuda-12.1/ in here at all? That is NVIDIA's CUDA. Then I remembered that this Triton container is an NVIDIA image, so of course it ships NVIDIA's own CUDA inside. Fine.
So I renamed the CUDA directory under /usr/local/ so the compiler could no longer find it,
mv cuda-12.1/ cuda-12.1_backup
then continued the build:
-- RapidJSON found. Headers: /usr/include
CMake Error at examples/CMakeLists.txt:28 (enable_language):
The CMAKE_CUDA_COMPILER:
/usr/local/cuda/bin/nvcc
is not a full path to an existing compiler tool.
Tell CMake where to find the compiler by setting either the environment
variable "CUDACXX" or the CMake cache entry CMAKE_CUDA_COMPILER to the full
path to the compiler, or to the compiler name if it is in the PATH.
-- Configuring incomplete, errors occurred
Then I did the following:
root@:/volume/triton_client_main/build# cd /opt/
root@:/opt# ll
total 20
drwxr-xr-x 1 root root 4096 Jul 18 01:34 ./
drwxr-xr-x 1 root root 4096 Jul 5 09:04 ../
lrwxrwxrwx 1 root root 19 Jul 18 01:34 xxx -> /opt/xxx-25.04-rc4//
drwxr-xr-x 38 root root 4096 Jul 18 01:33 xxx-25.04-rc4/
drwxr-xr-x 10 root root 4096 Sep 8 2023 hpcx/
lrwxrwxrwx 1 root root 17 Jul 18 01:32 xxx -> /usr/local/xxxx//
drwxr-xr-x 5 root root 4096 Sep 8 2023 nvidia/
root@:/opt# source xxx/env.sh
root:/opt# source xxx/cuda/env.sh
Rebuilding still hit the same error, so I re-ran cmake with the compiler path set explicitly:
cmake -DCMAKE_INSTALL_PREFIX=../install -DTRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server -DTRITON_COMMON_REPO_TAG=main -DTRITON_CORE_REPO_TAG=main -DTRITON_THIRD_PARTY_REPO_TAG=main -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF -DTRITON_ENABLE_JAVA_HTTP=OFF -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON -DTRITON_ENABLE_GPU=OFF -DCMAKE_CUDA_COMPILER=/opt/xxx/cuda/bin/nvcc ..
Re-running cmake like this and rebuilding still did not work. Then it occurred to me to just set a symlink directly:
ln -snf /opt/xxx/cuda /usr/local/cuda
6.2 /usr/bin/ld: cannot find -lcudart_static: No such file or directory
I tried a few things without success.
I also replaced the NVIDIA /usr/local/cuda in the jump-host container, because previously I had only replaced it in the node server container, not in the jump-host container.
Later I swapped out this line in the CMake file:
#enable_language(CUDA) #add by cumtchw.
find_package(CUDA)
With find_package(CUDA), the "cannot find -lcudart_static" error went away, but the errors below appeared instead: the .cu file was simply not being compiled in. My take is that the -lcudart_static error disappeared precisely because the .cu file was not compiled, so that library was never needed in the first place.
/usr/bin/ld: yolov5s_postprocess.cc:(.text+0x159c): undefined reference to refineYolov5sBoxesCpu(float const*, int, int, int, std::vector<float*, std::allocator<float*> >&, std::vector<int, std::allocator<int> >&)'
/usr/bin/ld: yolov5s_postprocess.cc:(.text+0x17c5): undefined reference to yolov5sNmsGpu(float const*, int, unsigned long long*, float, int*, int*, int, CUstream_st*)'
collect2: error: ld returned 1 exit status
With enable_language(CUDA), the undefined references to the .cu functions went away, but "cannot find -lcudart_static: No such file or directory" came back.
The way to resolve this is to keep enable_language(CUDA) rather than find_package(CUDA), and additionally add the following line so it stops looking for the static runtime library:
set(CMAKE_CUDA_RUNTIME_LIBRARY "SHARED")
The modified CMake snippet looks like this:
cmake_minimum_required (VERSION 3.18)
set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)# this line alone does not help; "cannot find -lcudart_static: No such file or directory" persists
set(CMAKE_CUDA_RUNTIME_LIBRARY "SHARED")# with this line the "cannot find -lcudart_static: No such file or directory" error goes away
set(CMAKE_POSITION_INDEPENDENT_CODE ON) # position-independent code
enable_language(CUDA) #add by cumtchw.
#find_package(CUDA) # FindCUDA is deprecated in newer CMake versions.
6.3 relocation R_X86_64_32 against symbol `_Z19yolov5sDecodeKernelPKfiiifS0_Pf' can not be used when making a PIE object; recompile with -fPIE
[ 71%] Linking CXX executable yolov5s
/usr/bin/ld: CMakeFiles/yolov5s.dir/yolov5s_postprocess.cu.o: relocation R_X86_64_32 against symbol `_Z19yolov5sDecodeKernelPKfiiifS0_Pf' can not be used when making a PIE object; recompile with -fPIE
/usr/bin/ld: failed to set dynamic section sizes: bad value
collect2: error: ld returned 1 exit status
make[6]: *** [examples/CMakeFiles/yolov5s.dir/build.make:261: examples/yolov5s] Error 1
make[5]: *** [CMakeFiles/Makefile2:1229: examples/CMakeFiles/yolov5s.dir/all] Error 2
make[4]: *** [Makefile:136: all] Error 2
This is a position-independent-code problem; the fix is to add set(CMAKE_POSITION_INDEPENDENT_CODE ON).
The modified src/c++/examples/CMakeLists.txt looks like this (only the first half is copied):
cmake_minimum_required (VERSION 3.18)
set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)# this line alone does not help; "cannot find -lcudart_static: No such file or directory" persists
set(CMAKE_CUDA_RUNTIME_LIBRARY "SHARED")# with this line the "cannot find -lcudart_static: No such file or directory" error goes away
set(CMAKE_POSITION_INDEPENDENT_CODE ON) # position-independent code
enable_language(CUDA) #add by cumtchw.
#find_package(CUDA) # FindCUDA is deprecated in newer CMake versions.
if(WIN32)
message("C++ examples are not currently supported on Windows because "
"they require functionalities that are UNIX specific.")
else()
if(TRITON_ENABLE_CC_HTTP AND TRITON_ENABLE_CC_GRPC)
#
# yolov5s
#
find_package(OpenCV REQUIRED)
add_executable(
#cuda_add_executable( # this command is not recognized
yolov5s
yolov5s.cc
yolov5s_postprocess.cc
yolov5s_postprocess.cu
$<TARGET_OBJECTS:json-utils-library>
)
target_include_directories(
yolov5s
PRIVATE ${OpenCV_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}
/opt/xxx/cuda/include/
)
target_link_libraries(
yolov5s
PRIVATE
grpcclient_static
httpclient_static
cudart
${OpenCV_LIBS}
)
install(
TARGETS yolov5s
RUNTIME DESTINATION bin
)
#
# yolov7-tiny
#
find_package(OpenCV REQUIRED)
add_executable(
yolov7-tiny
yolov7-tiny.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_include_directories(
yolov7-tiny
PRIVATE ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(
yolov7-tiny
PRIVATE
grpcclient_static
httpclient_static
${OpenCV_LIBS}
)
install(
TARGETS yolov7-tiny
RUNTIME DESTINATION bin
)
#
# resnet50
#
find_package(OpenCV REQUIRED)
add_executable(
resnet50
resnet50.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_include_directories(
resnet50
PRIVATE ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(
resnet50
PRIVATE
grpcclient_static
httpclient_static
${OpenCV_LIBS}
)
install(
TARGETS resnet50
RUNTIME DESTINATION bin
)
#
# image_client
#
find_package(OpenCV REQUIRED)
add_executable(
image_client
image_client.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_include_directories(
image_client
PRIVATE ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(
image_client
PRIVATE
grpcclient_static
httpclient_static
${OpenCV_LIBS}
)
install(
TARGETS image_client
RUNTIME DESTINATION bin
)
7 Running again and fixing more errors
After all the fiddling above, the build finally went through. Run it again:
../install/bin/yolov5s yolov5s /volume/triton_client_main/images/bus/ 1
7.1 launch bounds (256) for kernel _Z19yolov5sDecodeKernelPKfiiifS0_Pf please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
It still fails:
Request 0, batch size 1
Launch params (512, 1, 1) are larger than launch bounds (256) for kernel _Z19yolov5sDecodeKernelPKfiiifS0_Pf please add __launch_bounds__ to kernel define or use --gpu-max-threads-per-block recompile program !
Segmentation fault (core dumped)
This error occurs because GPUfusion, for maximum program efficiency, defaults the maximum number of threads per block to 256; if this message shows up at runtime, that limit has to be adjusted.
The fix is to add __launch_bounds__(1024) in front of each kernel definition (alternatively, the kernels could be launched with at most 256 threads per block):
__global__ void __launch_bounds__(1024) yolov5s_nms_kernel(
7.2 Segmentation fault (core dumped)
[Time] Total preprocess time:================== 32.112 ms
preprocess time:32.991097 ms
inference time:18.917245 ms
Request 0, batch size 1
Segmentation fault (core dumped)
root@nodexxx:/volume/triton_client_main/build#
root@nodexxx:/volume/triton_client_main/build#
root@:/volume/triton_client_main/build# gdb --args ../install/bin/yolov5s yolov5s /volume/triton_client_main/images/bus/ 1
GNU gdb (Ubuntu 12.1-0ubuntu1~22.04.3) 12.1
Copyright (C) 2022 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ../install/bin/yolov5s...
(No debugging symbols found in ../install/bin/yolov5s)
Add -DCMAKE_BUILD_TYPE=Debug and re-run cmake:
cmake -DCMAKE_INSTALL_PREFIX=../install -DTRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server -DTRITON_COMMON_REPO_TAG=main -DTRITON_CORE_REPO_TAG=main -DTRITON_THIRD_PARTY_REPO_TAG=main -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF -DTRITON_ENABLE_JAVA_HTTP=OFF -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON -DTRITON_ENABLE_GPU=OFF -DCMAKE_BUILD_TYPE=Debug ..
Rebuild, then run gdb --args ../install/bin/yolov5s yolov5s /volume/triton_client_main/images/bus/ 1
Thread 1 "yolov5s" received signal SIGSEGV, Segmentation fault.
refineYolov5sBoxesCpu (decoded_boxes=0x7fffe0100000, box_num=25200, elem_size=85, class_num=80, outputs=std::vector of length 80, capacity 80 = {...}, box_nums=std::vector of length 80, capacity 80 = {...}) at /volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cu:86
86 float score = data[1];
(gdb)
(gdb)
(gdb)
(gdb) bt
#0 refineYolov5sBoxesCpu (decoded_boxes=0x7fffe0100000, box_num=25200, elem_size=85, class_num=80, outputs=std::vector of length 80, capacity 80 = {...},
box_nums=std::vector of length 80, capacity 80 = {...}) at /volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cu:86
#1 0x000055555560e95a in yolov5s::Yolov5sPostProcess::enqueue (this=0x555556dd2ab0, batchSize=1, boxNum=25200, elemSize=85, inputs=0x7fffffffcb98,
outputs=0x7fffffffcb60) at /volume/triton_client_main/src/c++/examples/yolov5s_postprocess.cc:173
#2 0x00005555555fa8e7 in (anonymous namespace)::Postprocess (result=std::unique_ptr<triton::client::InferResult> = {...},
filenames=std::vector of length 1, capacity 1 = {...}, batch_size=1, output_name="output0", maxBatchSize=64)
at /volume/triton_client_main/src/c++/examples/yolov5s.cc:262
#3 0x00005555555fd46d in main (argc=4, argv=0x7fffffffd708) at /volume/triton_client_main/src/c++/examples/yolov5s.cc:729
(gdb)
In the end I gave up on plain gdb and stepped through it in VS Code instead. The launch.json configuration:
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "(gdb) Launch",
"type": "cppdbg",
"request": "launch",
"program": "/volume/triton_client_main/install/bin/yolov5s",
"args": [
"yolov5s", "/volume/triton_client_main/images/bus",
"1"
],
"stopAtEntry": false,
"cwd": "${workspaceFolder}",
"environment": [],
"externalConsole": false,
"MIMode": "gdb",
"miDebuggerPath": "/usr/bin/gdb" ,
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "set env LD_LIBRARY_PATH=/usr/local/cuda/lib64/:/opt/xxx-25.04-rc4/lib/:$LD_LIBRARY_PATH",
"ignoreFailures": true
}
]
}
]
}
7.3 Timing
<<<<<< cudaMemcpyAsync time = 14.1253 ms
<<<<<< yolov5sDecodeGpu time = 6.39939 ms
<<<<<< cudaMemcpyAsync decoded_yolo_data_host_ time = 0.914081 ms
<<<<<< refineYolov5sBoxesCpu time = 0.093012 ms
<<<<<< cudaMemcpyAsync time = 0.02022 ms
<<<<<< yolov5sNmsGpu time = 0.364955 ms
<<<<<< cudaMemset time = 1.14096 ms
<<<<<< cudaMemcpyAsync time = 0.010901 ms
<<<<<< enqueue inside all time = 23.0767 ms
Once it stopped crashing, I added some timing prints, and the result was disappointing: at 20-plus milliseconds this is actually worse than doing the post-processing on the CPU, which only takes about 10 ms, so the effort has not paid off yet.
The biggest cost is the cudaMemcpyAsync:
int Yolov5sPostProcess::enqueue(
int batchSize,
int boxNum,
int elemSize,
const void *const *inputs,
void *const *outputs
)
{
std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
const float* cpu_input = static_cast<const float*>(inputs[0]);
unsigned long long input_size = batchSize * boxNum * elemSize * sizeof(float);
cudaMemcpyAsync(raw_yolo_batch_data_, cpu_input, input_size, cudaMemcpyHostToDevice, stream_);
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
std::cout << "<<<<<< cudaMemcpyAsync time = " << std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count() * 1000 << " ms" << std::endl;
...
...
}
This copy exists because the data returned on the client side lives on the CPU, so it has to be moved to the GPU before the kernels can process it, yet the memcpy turns out to be surprisingly expensive.
It will have to stay like this for now; once this code runs on the server side, where the data is already on the GPU, this step should get cheaper.
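Two client-side tweaks would likely make that number both more honest and smaller: timing the copy with CUDA events rather than host-side chrono around an asynchronous call, and pinning the source buffer so the host-to-device transfer can take the fast DMA path. The sketch below is only an illustration of those two ideas; the helper name and arguments are hypothetical and not part of the class above, and whether it actually helps on this particular GPUfusion stack is untested.
#include <cuda_runtime.h>
// Hypothetical helper: pin the source buffer, time one H2D copy with CUDA events,
// and return the elapsed milliseconds.
static float timedPinnedH2DCopy(float* dst_device, const float* src_host,
                                size_t bytes, cudaStream_t stream) {
    // Pinning has a one-time cost, so it only pays off for buffers reused across calls.
    cudaHostRegister(const_cast<float*>(src_host), bytes, cudaHostRegisterDefault);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, stream);
    cudaMemcpyAsync(dst_device, src_host, bytes, cudaMemcpyHostToDevice, stream);
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop); // wait for the copy to finish before reading the timer
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaHostUnregister(const_cast<float*>(src_host));
    return ms;
}
In enqueue() this could stand in for the first chrono-timed cudaMemcpyAsync, e.g. timedPinnedH2DCopy(raw_yolo_batch_data_, cpu_input, input_size, stream_).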