Yolov8 pose关键点检测 C++ GPU部署：ONNXRuntime cuda部署-腾讯云开发者社区-腾讯云 (2024)

💡💡💡本文摘要：本文提供了YOLOv8 pose关键点检测 c++部署方式，ONNX Runtime CUDA和cpu部署

💡💡💡ONNX Runtime优点：通用性好，速度较快，适合各个平台复制；

💡💡💡下图为ONNX Runtime CUDA推理结果

Yolov8 pose关键点检测 C++ GPU部署：ONNXRuntime cuda部署-腾讯云开发者社区-腾讯云 (1)

1. ONNX和Tensorrt区别

ONNX Runtime 是将 ONNX 模型部署到生产环境的跨平台高性能运行引擎，主要对模型图应用了大量的图优化，然后基于可用的特定于硬件的加速器将其划分为子图（并行处理）。

ONNX的官方网站：https://onnx.ai/

ONXX的GitHub地址：https://github.com/onnx/onnx

Yolov8 pose关键点检测 C++ GPU部署：ONNXRuntime cuda部署-腾讯云开发者社区-腾讯云 (2)

1.2 Tensorrt介绍

C++ 库，用于加速 NVIDIA 的 GPU，可以为深度学习应用提供低延迟、高吞吐率的部署推理，支持 TensorFlow，Pytorch，Caffe2 ，Paddle等框架训练出的神经网络，可以优化网络计算TensorRT官网下载地址：https://developer.nvidia.com/zh-cn/tensorrt

开发者指南：https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html

Github地址：https://github.com/NVIDIA/TensorRT

1.3 Yolov8两种部署方式比较：

Tensorrt 优点：在GPU上推理速度是最快的；缺点：不同显卡cuda版本可能存在不适用情况；

ONNX Runtime优点：通用性好，速度较快，适合各个平台复制；

2.Yolov8 poseONNX Runtime部署

2.1 如何得到 .onnx

代码语言：javascript

复制

2.2 主函数代码：

yolov8onnxruntime.cpp

代码语言：txt

复制

#include <iostream>#include<opencv2/opencv.hpp>#include<math.h>#include "yolov8.h"#include "yolov8_onnx.h"#include "yolov8_pose_onnx.h"//#include "yolov8_seg_onnx.h"#include<time.h>//#define VIDEO_OPENCV //if define, use opencv for video.using namespace std;using namespace cv;using namespace dnn;template<typename _Tp>int yolov8(_Tp& task, cv::Mat& img, std::string& model_path){cv::dnn::Net net;if (task.ReadModel(net, model_path, true)) {std::cout << "read net ok!" << std::endl;}else {return -1;}//生成随机颜色std::vector<cv::Scalar> color;srand(time(0));for (int i = 0; i < 80; i++) {int b = rand() % 256;int g = rand() % 256;int r = rand() % 256;color.push_back(cv::Scalar(b, g, r));}std::vector<OutputParams> result;bool isPose = false;PoseParams poseParams;if (task.Detect(img, net, result)) {if (isPose)DrawPredPose(img, result, poseParams);elseDrawPred(img, result, task._className, color);}else {std::cout << "Detect Failed!" << std::endl;}system("pause");return 0;}template<typename _Tp>int yolov8_onnx(_Tp& task, cv::Mat& img, std::string& model_path){if (task.ReadModel(model_path, false)) {std::cout << "read net ok!" << std::endl;}else {return -1;}//生成随机颜色std::vector<cv::Scalar> color;srand(time(0));for (int i = 0; i < 80; i++) {int b = rand() % 256;int g = rand() % 256;int r = rand() % 256;color.push_back(cv::Scalar(b, g, r));}bool isPose = false;if (typeid(task) == typeid(Yolov8PoseOnnx)) {isPose = true;}PoseParams poseParams;std::vector<OutputParams> result;if (task.OnnxDetect(img, result)) {if (isPose)DrawPredPose(img, result, poseParams);elseDrawPred(img, result, task._className, color);}else {std::cout << "Detect Failed!" << std::endl;}system("pause");return 0;}int main() {std::string img_path = "D:/DL/AIDeploy/YOLOv8-Deploy/yolov8onnxruntime/model/bus.jpg";//std::string model_path_detect = "D:/DL/AIDeploy/YOLOv8-Deploy/yolov8onnxruntime/model/yolov8n.onnx";//std::string model_path_seg = "D:/DL/AIDeploy/YOLOv8-Deploy/yolov8onnxruntime/model/yolov8n-seg.onnx";std::string model_path_pose = "D:/DL/AIDeploy/YOLOv8-Deploy/yolov8onnxruntime/model/yolov8n-pose.onnx";//Yolov8SegOnnxtask_segment_ort;Yolov8PoseOnnxtask_pose_ort;cv::Mat src = imread(img_path);cv::Mat img = src.clone();yolov8_onnx(task_pose_ort, img, model_path_pose); //yolov8 onnxruntime pose//Yolov8task_detect_ocv;//Yolov8Onnxtask_detect_ort;//yolov8_onnx(task_detect_ort, img, model_path_detect); //yoolov8 onnxruntime detect//yolov8_onnx(task_segment_ort, img, model_path_seg); //yolov8 onnxruntime segmentreturn 0;}

yolov8_pose_onnx.cpp

代码语言：txt

复制

#include "yolov8_pose_onnx.h"//using namespace std;//using namespace cv;//using namespace cv::dnn;using namespace Ort;bool Yolov8PoseOnnx::ReadModel(const std::string& modelPath, bool isCuda, int cudaID, bool warmUp) {if (_batchSize < 1) _batchSize = 1;try{if (!CheckModelPath(modelPath))return false;std::vector<std::string> available_providers = GetAvailableProviders();auto cuda_available = std::find(available_providers.begin(), available_providers.end(), "CUDAExecutionProvider");if (isCuda && (cuda_available == available_providers.end())){std::cout << "Your ORT build without GPU. Change to CPU." << std::endl;std::cout << "************* Infer model on CPU! *************" << std::endl;}else if (isCuda && (cuda_available != available_providers.end())){std::cout << "************* Infer model on GPU! *************" << std::endl;#if ORT_API_VERSION < ORT_OLD_VISONOrtCUDAProviderOptions cudaOption;cudaOption.device_id = cudaID;_OrtSessionOptions.AppendExecutionProvider_CUDA(cudaOption);#elseOrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(_OrtSessionOptions, cudaID);#endif}else{std::cout << "************* Infer model on CPU! *************" << std::endl;}//_OrtSessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);#ifdef _WIN32std::wstring model_path(modelPath.begin(), modelPath.end());_OrtSession = new Ort::Session(_OrtEnv, model_path.c_str(), _OrtSessionOptions);#else_OrtSession = new Ort::Session(_OrtEnv, modelPath.c_str(), _OrtSessionOptions);#endifOrt::AllocatorWithDefaultOptions allocator;//init input_inputNodesNum = _OrtSession->GetInputCount();#if ORT_API_VERSION < ORT_OLD_VISON_inputName = _OrtSession->GetInputName(0, allocator);_inputNodeNames.push_back(_inputName);#else_inputName = std::move(_OrtSession->GetInputNameAllocated(0, allocator));_inputNodeNames.push_back(_inputName.get());#endif//std::cout << _inputNodeNames[0] << std::endl;Ort::TypeInfo inputTypeInfo = _OrtSession->GetInputTypeInfo(0);auto input_tensor_info = inputTypeInfo.GetTensorTypeAndShapeInfo();_inputNodeDataType = input_tensor_info.GetElementType();_inputTensorShape = input_tensor_info.GetShape();if (_inputTensorShape[0] == -1){_isDynamicShape = true;_inputTensorShape[0] = _batchSize;}if (_inputTensorShape[2] == -1 || _inputTensorShape[3] == -1) {_isDynamicShape = true;_inputTensorShape[2] = _netHeight;_inputTensorShape[3] = _netWidth;}//init output_outputNodesNum = _OrtSession->GetOutputCount();#if ORT_API_VERSION < ORT_OLD_VISON_output_name0 = _OrtSession->GetOutputName(0, allocator);_outputNodeNames.push_back(_output_name0);#else_output_name0 = std::move(_OrtSession->GetOutputNameAllocated(0, allocator));_outputNodeNames.push_back(_output_name0.get());#endifOrt::TypeInfo type_info_output0(nullptr);type_info_output0 = _OrtSession->GetOutputTypeInfo(0); //output0auto tensor_info_output0 = type_info_output0.GetTensorTypeAndShapeInfo();_outputNodeDataType = tensor_info_output0.GetElementType();_outputTensorShape = tensor_info_output0.GetShape();//_outputMaskNodeDataType = tensor_info_output1.GetElementType(); //the same as output0//_outputMaskTensorShape = tensor_info_output1.GetShape();//if (_outputTensorShape[0] == -1)//{//_outputTensorShape[0] = _batchSize;//_outputMaskTensorShape[0] = _batchSize;//}//if (_outputMaskTensorShape[2] == -1) {////size_t ouput_rows = 0;////for (int i = 0; i < _strideSize; ++i) {////ouput_rows += 3 * (_netWidth / _netStride[i]) * _netHeight / _netStride[i];////}////_outputTensorShape[1] = ouput_rows;//_outputMaskTensorShape[2] = _segHeight;//_outputMaskTensorShape[3] = _segWidth;//}//warm upif (isCuda && warmUp) {//draw runstd::cout << "Start warming up" << std::endl;size_t input_tensor_length = VectorProduct(_inputTensorShape);float* temp = new float[input_tensor_length];std::vector<Ort::Value> input_tensors;std::vector<Ort::Value> output_tensors;input_tensors.push_back(Ort::Value::CreateTensor<float>(_OrtMemoryInfo, temp, input_tensor_length, _inputTensorShape.data(),_inputTensorShape.size()));for (int i = 0; i < 3; ++i) {output_tensors = _OrtSession->Run(Ort::RunOptions{ nullptr },_inputNodeNames.data(),input_tensors.data(),_inputNodeNames.size(),_outputNodeNames.data(),_outputNodeNames.size());}delete[]temp;}}catch (const std::exception&) {return false;}return true;}int Yolov8PoseOnnx::Preprocessing(const std::vector<cv::Mat>& srcImgs, std::vector<cv::Mat>& outSrcImgs, std::vector<cv::Vec4d>& params) {outSrcImgs.clear();cv::Size input_size = cv::Size(_netWidth, _netHeight);for (int i = 0; i < srcImgs.size(); ++i) {cv::Mat temp_img = srcImgs[i];cv::Vec4d temp_param = {1,1,0,0};if (temp_img.size() != input_size) {cv::Mat borderImg;LetterBox(temp_img, borderImg, temp_param, input_size, false, false, true, 32);//std::cout << borderImg.size() << std::endl;outSrcImgs.push_back(borderImg);params.push_back(temp_param);}else {outSrcImgs.push_back(temp_img);params.push_back(temp_param);}}int lack_num = _batchSize- srcImgs.size();if (lack_num > 0) {for (int i = 0; i < lack_num; ++i) {cv::Mat temp_img = cv::Mat::zeros(input_size, CV_8UC3);cv::Vec4d temp_param = { 1,1,0,0 };outSrcImgs.push_back(temp_img);params.push_back(temp_param);}}return 0;}bool Yolov8PoseOnnx::OnnxDetect(cv::Mat& srcImg, std::vector<OutputParams>& output) {std::vector<cv::Mat> input_data = { srcImg };std::vector<std::vector<OutputParams>> tenp_output;if (OnnxBatchDetect(input_data, tenp_output)) {output = tenp_output[0];return true;}else return false;}bool Yolov8PoseOnnx::OnnxBatchDetect(std::vector<cv::Mat>& srcImgs, std::vector<std::vector<OutputParams>>& output) {std::vector<cv::Vec4d> params;std::vector<cv::Mat> input_images;cv::Size input_size(_netWidth, _netHeight);//preprocessingPreprocessing(srcImgs, input_images, params);cv::Mat blob = cv::dnn::blobFromImages(input_images, 1 / 255.0, input_size, cv::Scalar(0, 0, 0), true, false);int64_t input_tensor_length = VectorProduct(_inputTensorShape);std::vector<Ort::Value> input_tensors;std::vector<Ort::Value> output_tensors;input_tensors.push_back(Ort::Value::CreateTensor<float>(_OrtMemoryInfo, (float*)blob.data, input_tensor_length, _inputTensorShape.data(), _inputTensorShape.size()));output_tensors = _OrtSession->Run(Ort::RunOptions{ nullptr },_inputNodeNames.data(),input_tensors.data(),_inputNodeNames.size(),_outputNodeNames.data(),_outputNodeNames.size());//post-processfloat* all_data = output_tensors[0].GetTensorMutableData<float>();_outputTensorShape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();int net_width = _outputTensorShape[1];int key_point_length = net_width - 5;int key_point_num = 17; //_bodyKeyPoints.size(), shape[x, y, confidence]if (key_point_num * 3 != key_point_length) {std::cout << "Pose should be shape [x, y, confidence] with 17-points" << std::endl;return false;}int64_t one_output_length = VectorProduct(_outputTensorShape) / _outputTensorShape[0];for (int img_index = 0; img_index < srcImgs.size(); ++img_index) {cv::Mat output0 = cv::Mat(cv::Size((int)_outputTensorShape[2], (int)_outputTensorShape[1]), CV_32F, all_data).t(); //[bs,116,8400]=>[bs,8400,116]all_data += one_output_length;float* pdata = (float*)output0.data;int rows = output0.rows;std::vector<int> class_ids;//结果id数组std::vector<float> confidences;//结果每个id对应置信度数组std::vector<cv::Rect> boxes;//每个id矩形框std::vector<std::vector<PoseKeyPoint>> pose_key_points; //保存kptfor (int r = 0; r < rows; ++r) { float max_class_socre= pdata[4];if (max_class_socre >= _classThreshold) {//rect [x,y,w,h]float x = (pdata[0] - params[img_index][2]) / params[img_index][0]; //xfloat y = (pdata[1] - params[img_index][3]) / params[img_index][1]; //yfloat w = pdata[2] / params[img_index][0]; //wfloat h = pdata[3] / params[img_index][1]; //hint left = MAX(int(x - 0.5 * w + 0.5), 0);int top = MAX(int(y - 0.5 * h + 0.5), 0);class_ids.push_back(0);confidences.push_back(max_class_socre);boxes.push_back(cv::Rect(left, top, int(w + 0.5), int(h + 0.5)));std::vector<PoseKeyPoint> temp_kpts;for (int kpt = 0; kpt < key_point_length; kpt += 3) {PoseKeyPoint temp_kp;temp_kp.x = (pdata[5 + kpt]- params[img_index][2]) / params[img_index][0];temp_kp.y = (pdata[6 + kpt] - params[img_index][3]) / params[img_index][1];temp_kp.confidence = pdata[7 + kpt];temp_kpts.push_back(temp_kp);}pose_key_points.push_back(temp_kpts);}pdata += net_width;//下一行}std::vector<int> nms_result;cv::dnn::NMSBoxes(boxes, confidences, _classThreshold, _nmsThreshold, nms_result);std::vector<std::vector<float>> temp_mask_proposals;cv::Rect holeImgRect(0, 0, srcImgs[img_index].cols, srcImgs[img_index].rows);std::vector<OutputParams> temp_output;for (int i = 0; i < nms_result.size(); ++i) {int idx = nms_result[i];OutputParams result;result.id = class_ids[idx];result.confidence = confidences[idx];result.box = boxes[idx] & holeImgRect;result.keyPoints = pose_key_points[idx];temp_output.push_back(result);}output.push_back(temp_output);}if (output.size())return true;elsereturn false;}

yolov8_pose_onnx.h

代码语言：txt

复制

#pragma once#include <iostream>#include<memory>#include <opencv2/opencv.hpp>#include "yolov8_utils.h"#include<onnxruntime_cxx_api.h>//#include <tensorrt_provider_factory.h> //if use OrtTensorRTProviderOptionsV2//#include <onnxruntime_c_api.h>class Yolov8PoseOnnx {public:Yolov8PoseOnnx() :_OrtMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, OrtMemType::OrtMemTypeCPUOutput)) {};~Yolov8PoseOnnx() {if (_OrtSession != nullptr)delete _OrtSession;};// delete _OrtMemoryInfo;public:/** \brief Read onnx-model* \param[in] modelPath:onnx-model path* \param[in] isCuda:if true,use Ort-GPU,else run it on cpu.* \param[in] cudaID:if isCuda==true,run Ort-GPU on cudaID.* \param[in] warmUp:if isCuda==true,warm up GPU-model.*/bool ReadModel(const std::string& modelPath, bool isCuda = false, int cudaID = 0, bool warmUp = true);/** \brief detect.* \param[in] srcImg:a 3-channels image.* \param[out] output:detection results of input image.*/bool OnnxDetect(cv::Mat& srcImg, std::vector<OutputParams>& output);/** \brief detect,batch size= _batchSize* \param[in] srcImg:A batch of images.* \param[out] output:detection results of input images.*/bool OnnxBatchDetect(std::vector<cv::Mat>& srcImg, std::vector<std::vector<OutputParams>>& output);private:template <typename T>T VectorProduct(const std::vector<T>& v){return std::accumulate(v.begin(), v.end(), 1, std::multiplies<T>());};int Preprocessing(const std::vector<cv::Mat>& srcImgs, std::vector<cv::Mat>& outSrcImgs, std::vector<cv::Vec4d>& params);const int _netWidth = 640; //ONNX-net-input-widthconst int _netHeight = 640; //ONNX-net-input-heightint _batchSize = 1; //if multi-batch,set thisbool _isDynamicShape = false;//onnx support dynamic shapefloat _classThreshold = 0.25;float _nmsThreshold = 0.45;//ONNXRUNTIMEOrt::Env _OrtEnv = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR, "Yolov8");Ort::SessionOptions _OrtSessionOptions = Ort::SessionOptions();Ort::Session* _OrtSession = nullptr;Ort::MemoryInfo _OrtMemoryInfo;#if ORT_API_VERSION < ORT_OLD_VISONchar* _inputName, * _output_name0;#elsestd::shared_ptr<char> _inputName, _output_name0;#endifstd::vector<char*> _inputNodeNames; //输入节点名std::vector<char*> _outputNodeNames;//输出节点名size_t _inputNodesNum = 0; //输入节点数size_t _outputNodesNum = 0; //输出节点数ONNXTensorElementDataType _inputNodeDataType; //数据类型ONNXTensorElementDataType _outputNodeDataType;std::vector<int64_t> _inputTensorShape; //输入张量shapestd::vector<int64_t> _outputTensorShape;public:std::vector<std::string> _className = {"person"};};

Yolov8 pose关键点检测 C++ GPU部署：ONNXRuntime cuda部署-腾讯云开发者社区-腾讯云 (2024)

1. ONNX和Tensorrt区别

1.2 Tensorrt介绍

1.3 Yolov8两种部署方式比较：

2.Yolov8 poseONNX Runtime部署

2.1 如何得到 .onnx

2.2 主函数代码：

References