本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《 阿里云开发者社区用户服务协议 》和 《 阿里云开发者社区知识产权保护指引 》。如果您发现本社区中有涉嫌抄袭的内容,填写 侵权投诉表单 进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。

5 TensorRT部署


5.1 目标检测常见的落地形式

image.png

1、TensorRT是什么

TensorRT是推理优化器,能对训练好的模型进行优化。可以理解为只有前向传播的深度学习框架,这个框架可以将Caffe,TensorFlow的网络模型解析,然后与TensorRT中对应的层进行一一映射,把其他框架的模型统一全部转换到TensorRT中,然后在TensorRT中可以针对NVIDIA自家GPU实施优化策略,并进行部署加速。当你的网络训练完之后,可以将训练模型文件直接丢进TensorRT中,而不再需要依赖深度学习框架(Caffe,TensorFlow等)。

2、本文AI部署流程

先把onnx转化为TensorRT的Engine文件,然后让c++环境下的TensorRT直接加载Engine文件,从而构建engine,本文主要讲解onnx转换至Engine,然后进行基于TensorRT的C++推理检测。

image

转换和部署模型5个基本步骤:

  • step1:获取模型
  • step2:选择batchsize
  • step3:选择精度
  • step4:模型转换
  • step5:模型部署

5.2 ONNX-TensorRT的部署流程

1、ONNX转化为TRT Engine

# 导出onnx文件
python export.py ---weights weights/v5lite-g.pt --batch-size 1 --imgsz 640 --include onnx --simplify
# 使用TensorRT官方的trtexec工具将onnx文件转换为engine
trtexec --explicitBatch --onnx=./v5lite-g.onnx --saveEngine=v5lite-g.trt --fp16

闲话不多说,这里已经拿到了trt的engine,那么如何进行推理呢?总的来说,分为3步:

  1. 首先load你的engine,拿到一个ICudaEngine, 这个是TensorRT推理的核心;
  2. 定位模型的输入和输出,有几个输入有几个输出;
  3. forward模型,然后拿到输出,对输出进行后处理。

当然这里最核心的东西其实就两个,一个是如何导入拿到CudaEngine,第二个是比较麻烦的后处理。

2、加载TRT Engine

bool Model::readTrtFile() {
    std::string cached_engine;
    std::fstream file;
    std::cout << "loading filename from:" << engine_file << std::endl;
    nvinfer1::IRuntime *trtRuntime;
    file.open(engine_file, std::ios::binary | std::ios::in);
    if (!file.is_open()) {
        std::cout << "read file error: " << engine_file << std::endl;
        cached_engine = "";
    while (file.peek() != EOF) {
        std::stringstream buffer;
        buffer << file.rdbuf();
        cached_engine.append(buffer.str());
    file.close();
    trtRuntime = nvinfer1::createInferRuntime(gLogger.getTRTLogger());
    engine = trtRuntime->deserializeCudaEngine(cached_engine.data(), cached_engine.size(), nullptr);
    std::cout << "deserialize done" << std::endl;
// 加载 TensorRT Engine
void v5Lite::LoadEngine() {
    // create and load engine
    std::fstream existEngine;
    existEngine.open(engine_file, std::ios::in);
    // 如果存在已经转换完成的TensorRT Engine文件,则直接加载
    if (existEngine) {
        readTrtFile(engine_file, engine);
        assert(engine != nullptr);
    // 如果不存在已经转换完成的TensorRT Engine文件,则直接加载ONNX权重进行在线生成
    else {
        onnxToTRTModel(onnx_file, engine_file, engine, BATCH_SIZE);
        assert(engine != nullptr);
}

3、后处理之坐标转换

image.png YOLOv5的坐标转换方法

std::vector<std::vector<V5lite::DetectRes>> V5lite::postProcess(const std::vector<cv::Mat> &vec_Mat, float *output,
                                                                const int &outSize) {
    std::vector<std::vector<DetectRes>> vec_result;
    int index = 0;
    for (const cv::Mat &src_img : vec_Mat)
        std::vector<DetectRes> result;
        float ratio = float(src_img.cols) / float(IMAGE_WIDTH) > float(src_img.rows) / float(IMAGE_HEIGHT)  ? float(src_img.cols) / float(IMAGE_WIDTH) : float(src_img.rows) / float(IMAGE_HEIGHT);
        float *out = output + index * outSize;
        int position = 0;
        for (int n = 0; n < (int)grids.size(); n++)
            for (int c = 0; c < grids[n][0]; c++)
                std::vector<int> anchor = anchors[n * grids[n][0] + c];
                for (int h = 0; h < grids[n][1]; h++)
                    for (int w = 0; w < grids[n][2]; w++)
                        float *row = out + position * (CATEGORY + 5);
                        position++;
                        DetectRes box;
                        auto max_pos = std::max_element(row + 5, row + CATEGORY + 5);
                        box.prob = row[4] * row[max_pos - row];
                        if (box.prob < obj_threshold)
                            continue;
                        box.classes = max_pos - row - 5;
                        // 坐标的反参数化,和前文的坐标转换对接
                        box.x = (row[0] * 2 - 0.5 + w) / grids[n][2] * IMAGE_WIDTH * ratio;
                        box.y = (row[1] * 2 - 0.5 + h) / grids[n][1] * IMAGE_HEIGHT * ratio;
                        box.w = pow(row[2] * 2, 2) * anchor[0] * ratio;
                        box.h = pow(row[3] * 2, 2) * anchor[1] * ratio;
                        result.push_back(box);
        NmsDetect(result);
        vec_result.push_back(result);
        index++;
    return vec_result;
}

4、进行模型推理

// 推理整个文件夹的文件
bool YOLOv5::InferenceFolder(const std::string &folder_name) {
    // 读取文件夹下面的文件,并返回为一个 string vector 迭代器
    std::vector<std::string> sample_images = readFolder(folder_name);
    //get context
    assert(engine != nullptr);
    // 创建上下文,创建一些空间来存储中间值。一个engine可以创建多个context,分别执行多个推理任务。
    context = engine->createExecutionContext();
    assert(context != nullptr);
    // 传递给Engine的输入输出buffers指针,这里对应一个输入和一个输出
    assert(engine->getNbBindings() == 2);
    void *buffers[2];
    std::vector<int64_t> bufferSize;
    int nbBindings = engine->getNbBindings();
    bufferSize.resize(nbBindings);
    for (int i = 0; i < nbBindings; ++i) {
        // 获取输入或输出的维度信息
        nvinfer1::Dims dims = engine->getBindingDimensions(i);
        // 获取输入或输出的数据类型信息
        nvinfer1::DataType dtype = engine->getBindingDataType(i);
        int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);
        bufferSize[i] = totalSize;
        std::cout << "binding" << i << ": " << totalSize << std::endl;
        // &buffers是双重指针 相当于改变指针本身,这里就是把输入或输出进行向量化操作
        cudaMalloc(&buffers[i], totalSize);
    //get stream
    cudaStream_t stream;
    // 创建 Stream
    cudaStreamCreate(&stream);
    int outSize = bufferSize[1] / sizeof(float) / BATCH_SIZE;
    // 执行推理
    EngineInference(sample_images, outSize, buffers, bufferSize, stream);
    // 释放 stream 和 buffers
    cudaStreamDestroy(stream);
    cudaFree(buffers[0]);
    cudaFree(buffers[1]);
    // destroy the engine
    context->destroy();
    engine->destroy();
void YOLOv5::EngineInference(const std::vector<std::string> &image_list, const int &outSize, void **buffers,
                             const std::vector<int64_t> &bufferSize, cudaStream_t stream) {
    int index = 0;
    int batch_id = 0;
    std::vector<cv::Mat> vec_Mat(BATCH_SIZE);
    std::vector<std::string> vec_name(BATCH_SIZE);
    float total_time = 0;
    // 遍历图像路径list
    for (const std::string &image_name : image_list)
        index++;
        std::cout << "Processing: " << image_name << std::endl;
        // 读取图像内容到cv_mat
        cv::Mat src_img = cv::imread(image_name);
        // 把图像和图像名分别保存在vec_Mat和vec_name之中
        if (src_img.data)
            vec_Mat[batch_id] = src_img.clone();
            vec_name[batch_id] = image_name;
            batch_id++;
        if (batch_id == BATCH_SIZE or index == image_list.size())
            // 声明时间戳 t_start_pre
            auto t_start_pre = std::chrono::high_resolution_clock::now();
            std::cout << "prepareImage" << std::endl;
            std::vector<float>curInput = prepareImage(vec_Mat);
            auto t_end_pre = std::chrono::high_resolution_clock::now();
            // 至此,prepare Image的时间已经计算完成
            float total_pre = std::chrono::duration<float, std::milli>(t_end_pre - t_start_pre).count();
            std::cout << "prepare image take: " << total_pre << " ms." << std::endl;
            total_time += total_pre;
            batch_id = 0;
            if (!curInput.data()) {
                std::cout << "prepare images ERROR!" << std::endl;
                continue;
            // 将数据从CPU端传送到GPU端
            std::cout << "host2device" << std::endl;
            cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream);
            // 执行推理
            std::cout << "execute" << std::endl;
            auto t_start = std::chrono::high_resolution_clock::now();
            context->execute(BATCH_SIZE, buffers);
            auto t_end = std::chrono::high_resolution_clock::now();
            float total_inf = std::chrono::duration<float, std::milli>(t_end - t_start).count();
            std::cout << "Inference take: " << total_inf << " ms." << std::endl;
            total_time += total_inf;
            std::cout << "execute success" << std::endl;
            std::cout << "device2host" << std::endl;
            std::cout << "post process" << std::endl;
            auto r_start = std::chrono::high_resolution_clock::now();
            auto *out = new float[outSize * BATCH_SIZE];
            // Copy GPU端的推理结果到CPU端
            cudaMemcpyAsync(out, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream);
            // 阻塞当前程序的执行,直到所有任务都处理完毕,这样可以将计算和主机与设备之前的传输并行化,提高效率。
            cudaStreamSynchronize(stream);
            // 进行后处理操作
            auto boxes = postProcess(vec_Mat, out, outSize);
            auto r_end = std::chrono::high_resolution_clock::now();
            float total_res = std::chrono::duration<float, std::milli>(r_end - r_start).count();
            std::cout << "Post process take: " << total_res << " ms." << std::endl;
            total_time += total_res;
            for (int i = 0; i < (int)vec_Mat.size(); i++)
                auto org_img = vec_Mat[i];
                if (!org_img.data)
                    continue;
                auto rects = boxes[i];
                for(const auto &rect : rects)
                    char t[256];
                    sprintf(t, "%.2f", rect.prob);
                    std::string name = coco_labels[rect.classes] + "-" + t;
                    // 图书添加文字
                    cv::putText(org_img, name, cv::Point(rect.x - rect.w / 2, rect.y - rect.h / 2 - 5), cv::FONT_HERSHEY_COMPLEX, 0.7, class_colors[rect.classes], 2);
                    // 绘制矩形框
                    cv::Rect rst(rect.x - rect.w / 2, rect.y - rect.h / 2, rect.w, rect.h);
                    cv::rectangle(org_img, rst, class_colors[rect.classes], 2, cv::LINE_8, 0);
                int pos = vec_name[i].find_last_of(".");
                std::string rst_name = vec_name[i].insert(pos, "_");
                std::cout << rst_name << std::endl;
                // 保存检测结果
                cv::imwrite(rst_name, org_img);
            vec_Mat = std::vector<cv::Mat>(BATCH_SIZE);
            delete[] out;
    std::cout << "Average processing time is " << total_time / image_list.size() << "ms" << std::endl;
}

5、后处理之NMS C++实现

void V5lite::NmsDetect(std::vector<DetectRes> &detections) {
    sort(detections.begin(), detections.end(), [=](const DetectRes &left, const DetectRes &right) {
        return left.prob > right.prob;
    for (int i = 0; i < (int)detections.size(); i++)
        for (int j = i + 1; j < (int)detections.size(); j++)
            if (detections[i].classes == detections[j].classes)
            {   // 计算DIoU的值
                float iou = IOUCalculate(detections[i], detections[j]);
                if (iou > nms_threshold)
                    detections[j].prob = 0;
    detections.erase(std::remove_if(detections.begin(), detections.end(), [](const DetectRes &det)
    { return det.prob == 0; }), detections.end());
// 计算 DIOU
float v5Lite::IOUCalculate(const YOLOv5::DetectRes &det_a, const YOLOv5::DetectRes &det_b) {
    cv::Point2f center_a(det_a.x, det_a.y);
    cv::Point2f center_b(det_b.x, det_b.y);
    // 计算左上角角点坐标
    cv::Point2f left_up(std::min(det_a.x - det_a.w / 2, det_b.x - det_b.w / 2),
                        std::min(det_a.y - det_a.h / 2, det_b.y - det_b.h / 2));
    // 计算右下角角点坐标
    cv::Point2f right_down(std::max(det_a.x + det_a.w / 2, det_b.x + det_b.w / 2),
                           std::max(det_a.y + det_a.h / 2, det_b.y + det_b.h / 2));
    // 计算框的中心点距离
    float distance_d = (center_a - center_b).x * (center_a - center_b).x + (center_a - center_b).y * (center_a - center_b).y;
    // 计算框的角点距离
    float distance_c = (left_up - right_down).x * (left_up - right_down).x + (left_up - right_down).y * (left_up - right_down).y;
    float inter_l = det_a.x - det_a.w / 2 > det_b.x - det_b.w / 2 ? det_a.x - det_a.w / 2 : det_b.x - det_b.w / 2;
    float inter_t = det_a.y - det_a.h / 2 > det_b.y - det_b.h / 2 ? det_a.y - det_a.h / 2 : det_b.y - det_b.h / 2;
    float inter_r = det_a.x + det_a.w / 2 < det_b.x + det_b.w / 2 ? det_a.x + det_a.w / 2 : det_b.x + det_b.w / 2;
    float inter_b = det_a.y + det_a.h / 2 < det_b.y + det_b.h / 2 ? det_a.y + det_a.h / 2 : det_b.y + det_b.h / 2;
    if (inter_b < inter_t || inter_r < inter_l)
        return 0;
    // 计算交集
    float inter_area = (inter_b - inter_t) * (inter_r - inter_l);
    // 计算并集
    float union_area = det_a.w * det_a.h + det_b.w * det_b.h - inter_area;
    if (union_area == 0)
        return 0;
        return inter_area / union_area - distance_d / distance_c;
}

CMakeLists.txt如下:

cmake_minimum_required(VERSION 3.5)
project(v5lite_trt)
set(CMAKE_CXX_STANDARD 14)
# CUDA
find_package(CUDA REQUIRED)
message(STATUS "Find CUDA include at ${CUDA_INCLUDE_DIRS}")
message(STATUS "Find CUDA libraries: ${CUDA_LIBRARIES}")
# TensorRT
set(TENSORRT_ROOT "/home/chaucer/TensorRT-8.0.1.6")
find_path(TENSORRT_INCLUDE_DIR NvInfer.h
        HINTS ${TENSORRT_ROOT} PATH_SUFFIXES include/)
message(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}")
find_library(TENSORRT_LIBRARY_INFER nvinfer
        HINTS ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
        PATH_SUFFIXES lib lib64 lib/x64)
find_library(TENSORRT_LIBRARY_ONNXPARSER nvonnxparser
        HINTS  ${TENSORRT_ROOT} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR}
        PATH_SUFFIXES lib lib64 lib/x64)
set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_ONNXPARSER})
message(STATUS "Find TensorRT libs: ${TENSORRT_LIBRARY}")
# OpenCV
find_package(OpenCV REQUIRED)
message(STATUS "Find OpenCV include at ${OpenCV_INCLUDE_DIRS}")
message(STATUS "Find OpenCV libraries: ${OpenCV_LIBRARIES}")
set(COMMON_INCLUDE ./includes/common)
set(YAML_INCLUDE ./includes/yaml-cpp/include)
set(YAML_LIB_DIR ./includes/yaml-cpp/libs)
include_directories(${CUDA_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${OpenCV_INCLUDE_DIRS} ${COMMON_INCLUDE} ${YAML_INCLUDE})
link_directories(${YAML_LIB_DIR})
add_executable(v5lite_trt main.cpp v5lite.cpp)
target_link_libraries(v5lite_trt ${OpenCV_LIBRARIES} ${CUDA_LIBRARIES} ${TENSORRT_LIBRARY} yaml-cpp)
mkdir build
cd build
cmake ..
make -j8
v5lite_trt ../config.yaml ../samples/

5、检测结果和时间

image image


6 参考


[1].https://github.com/ppogg/YOLOv5-Lite

[2].https://zhuanlan.zhihu.com/p/400545131

[3].https://github.com/ultralytics/yolov5

[4].https://zhuanlan.zhihu.com/p/172121380

[5].https://zhuanlan.zhihu.com/p/143747206

云原生AI套件:一键训练大模型及部署GPU共享推理服务 云原生AI套件:五分钟微调宝可梦风格StableDiffusion 使用PAI-EAS一键部署ChatGLM及LangChain应用 在云上部署ChatGLM2-6B大模型(GPU版) 热门Stable Diffusion模型库快速转存至个人NAS与加载训练 【文生图】可换模型,函数计算一键部署 Stable Diffusion