
WIN10+WSL+UBUNTU2004+mmdeploy+3090

This article documents, on a Windows 10 system, using a WSL2 Ubuntu 20.04 virtual system to install mmdeploy, train with mmseg, convert the .pth model, and deploy it with both Python and C++ on the ONNX Runtime and TensorRT backends.

Environment

OS: Windows 10

GPU: RTX 3090

Ubuntu: 20.04

Software

CUDA: 11.6 Update 2

NVIDIA driver: 511.65

cuDNN: 8.6.0

TensorRT: 8.5.3.1

ONNX Runtime: 1.8.1

Key points

  • The CUDA version on the Windows side should match the one in Ubuntu. Deployment can still succeed with mismatched versions, but the TensorRT conversion may then produce results that differ from ONNX; a quick way to compare the two sides is sketched below.
  • For 30-series cards the NVIDIA driver should be 470 or newer, otherwise WSL cannot use the driver directly.
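
For example, both versions can be read from inside WSL (a minimal sketch; the output format varies by driver release):

nvidia-smi | head -n 4   # reports the Windows driver version; should be >= 470 on a 3090
nvcc --version           # CUDA toolkit version inside Ubuntu; compare against the Windows-side CUDA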

Windows 10 side

Upgrade to WSL 2

Download the latest package: the WSL2 Linux kernel update package for x64 machines.
Run the update package downloaded in the previous step.

Set WSL 2 as the default version
Open PowerShell and run the following command so that WSL 2 becomes the default version when installing new Linux distributions:

wsl --set-default-version 2

Install your chosen Linux distribution

Open the Microsoft Store and pick your preferred Linux distribution. Once it is installed, it is best to enter the system and confirm that nvidia-smi runs directly, then inspect the PATH and record your CUDA-related variables, for example as sketched below.
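
A quick way to record this information (run inside the new distro):

nvidia-smi    # should already work through the Windows driver
echo $PATH    # note the /usr/lib/wsl/lib and Windows CUDA entries; they are reused in the exports below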

Move the storage directory

The virtual disk lives on the C: drive by default and eats a lot of disk space, so be sure to move it:

wsl -l -v # list the installed VMs
wsl --shutdown # stop all VMs
wsl -l -v # confirm all VMs are stopped
wsl --export Ubuntu-20.04  E:\u2004.rar # export the VM named Ubuntu-20.04 to the file u2004.rar (a tar archive despite the extension)
wsl --unregister Ubuntu-20.04 # unregister the VM named Ubuntu-20.04
wsl -l -v # list the installed VMs
wsl --import Ubuntu-20.04  E:\Ubuntu2004 E:\u2004.rar # re-import the VM as Ubuntu-20.04, storing it on the E: drive
wsl -l -v # list the installed VMs
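
One caveat: after wsl --import, the distro may log in as root by default. If so, the default user can be restored through /etc/wsl.conf (a sketch; 'jason' matches the home directory used later in this article):

# run inside the imported distro
printf '[user]\ndefault=jason\n' | sudo tee /etc/wsl.conf
# then restart it from PowerShell with: wsl --shutdown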

Ubuntu side

Set environment variables

After entering the system, set the environment variables first:

export PATH='/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/usr/lib/wsl/lib:/mnt/c/Program Files/WindowsApps/CanonicalGroupLimited.Ubuntu20.04LTS_2004.6.16.0_x64__79rhkp1fndgsc':$PATH
export PATH='/mnt/c/Program Files (x86)/Common Files/Intel/Shared Libraries/redist/intel64/compiler:/mnt/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.6/bin:/mnt/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.6/libnvvp:/mnt/c/Windows/system32:/mnt/c/Windows:/mnt/c/Windows/System32/Wbem:/mnt/c/Windows/System32/WindowsPowerShell/v1.0/:/mnt/c/Windows/System32/OpenSSH/':$PATH
export PATH='/mnt/c/Program Files/NVIDIA Corporation/NVIDIA NvDLISR:/mnt/c/Program Files (x86)/NVIDIA Corporation/PhysX/Common:/mnt/c/Program Files/Git/cmd:/mnt/c/WINDOWS/system32:/mnt/c/WINDOWS:/mnt/c/WINDOWS/System32/Wbem:/mnt/c/WINDOWS/System32/WindowsPowerShell/v1.0/:/mnt/c/WINDOWS/System32/OpenSSH/':$PATH
export PATH='/mnt/c/Program Files/NVIDIA Corporation/Nsight Compute 2022.1.1/:/mnt/c/Users/H/AppData/Local/Microsoft/WindowsApps:/mnt/c/Users/H/AppData/Local/Programs/Microsoft VS Code/bin:/snap/bin':$PATH
export PATH='/mnt/c/Program Files/NVIDIA Corporation/NVSMI':$PATH
export PATH=~/anaconda3/envs/mmdep/bin:~/anaconda3/bin:~/anaconda3/condabin:$PATH
export PATH=~/Code/cmake-3.20.0-linux-x86_64/bin/:$PATH
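
These exports only last for the current shell; to make them permanent, append the same lines to ~/.bashrc (one line shown as an example):

echo 'export PATH=~/Code/cmake-3.20.0-linux-x86_64/bin/:$PATH' >> ~/.bashrc
source ~/.bashrc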

Then run nvidia-smi and check whether the GPU information shows up. nvcc lives under /usr/local/cuda/bin and can be used directly.

Software

  1. Anaconda3-2023.03-0-Linux-x86_64, download and install directly
  2. cuDNN 8.6
  3. cuda_11.6.2_510.47.03_linux
  4. TensorRT 8.5.3.1

Conda environment

Create the environment

conda create -n mmdeploy python=3.7 -y
conda activate mmdeploy
pip install numpy==1.21.6 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install torch==1.12.0+cu116 torchvision==0.13.0+cu116 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu116 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install onnxruntime==1.8.1 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install onnxruntime-gpu==1.8.1 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install -U openmim  -i https://pypi.tuna.tsinghua.edu.cn/simple/
mim install mmengine  -i https://pypi.tuna.tsinghua.edu.cn/simple/
mim install "mmcv>=2.0.1" -i https://pypi.tuna.tsinghua.edu.cn/simple/
sudo apt-get install libspdlog-dev
sudo apt-get install libopencv-dev
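
A quick smoke test of the new environment before building anything (a sketch, not part of the original steps):

python -c "import torch; print(torch.__version__, torch.cuda.is_available())"   # expect: 1.12.0+cu116 True
python -c "import mmcv; print(mmcv.__version__)"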

ppl.cv

git clone https://github.com/openppl-public/ppl.cv.git
cd ppl.cv
export PPLCV_DIR=$(pwd)
git checkout tags/v0.7.1 -b v0.7.1
./build.sh cuda
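
The CUDA build installs a CMake package under cuda-build/install; that directory is passed to mmdeploy's cmake as pplcv_DIR later, so it is worth confirming it exists:

ls $PPLCV_DIR/cuda-build/install/lib/cmake/ppl   # should list the pplcv CMake config files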

tensorrt

tar -zxvf  TensorRT-8.5.3.1.Linux.x86_64-gnu.cuda-11.8.cudnn8.6.tar.gz 
pip install TensorRT-8.5.3.1/python/tensorrt-8.5.3.1-cp37-none-linux_x86_64.whl 
cd TensorRT-8.5.3.1/
export TENSORRT_DIR=$(pwd)
export LD_LIBRARY_PATH=$TENSORRT_DIR/lib:$LD_LIBRARY_PATH
cd uff/
pip install uff-0.6.9-py2.py3-none-any.whl 
cd ..
cd graphsurgeon/
pip install graphsurgeon-0.4.6-py2.py3-none-any.whl 
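
A quick check that the Python binding matches the extracted tarball (a sketch):

python -c "import tensorrt; print(tensorrt.__version__)"   # expect: 8.5.3.1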

onnxruntime

wget https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz
tar -zxvf onnxruntime-linux-x64-1.8.1.tgz
cd onnxruntime-linux-x64-1.8.1
export ONNXRUNTIME_DIR=$(pwd)
export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH
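
To confirm the GPU wheel is the active one (a sketch):

python -c "import onnxruntime as ort; print(ort.get_available_providers())"
# the list should include CUDAExecutionProvider, not just CPUExecutionProvider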

cudnn

tar -xJf cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
export CUDNN_DIR=$(pwd)/cudnn-linux-x86_64-8.6.0.163_cuda11-archive
export LD_LIBRARY_PATH=$CUDNN_DIR/lib:$LD_LIBRARY_PATH
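
The extracted archive can be checked against the expected 8.6.0 (a sketch):

grep -A 2 '#define CUDNN_MAJOR' $CUDNN_DIR/include/cudnn_version.h   # expect MAJOR 8, MINOR 6, PATCHLEVEL 0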

mmsegmentation

git clone https://github.com/open-mmlab/mmsegmentation.git
cd mmsegmentation
pip install -e . -i https://pypi.tuna.tsinghua.edu.cn/simple
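
Verify the editable install (a sketch):

python -c "import mmseg; print(mmseg.__version__)"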

mmdeploy

Here we build mmdeploy with ONNX Runtime and TensorRT backend support:

git clone --recursive https://github.com/open-mmlab/mmdeploy.git
cd mmdeploy
mkdir build
cd build
cmake .. \
    -DCMAKE_CXX_COMPILER=g++-9 \
    -DMMDEPLOY_BUILD_SDK=ON \
    -DMMDEPLOY_BUILD_EXAMPLES=ON \
    -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \
    -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \
    -DMMDEPLOY_TARGET_BACKENDS="trt;ort" \
    -Dpplcv_DIR=${PPLCV_DIR}/cuda-build/install/lib/cmake/ppl \
    -DTENSORRT_DIR=${TENSORRT_DIR} \
    -DCUDNN_DIR=${CUDNN_DIR} \
    -DONNXRUNTIME_DIR=${ONNXRUNTIME_DIR}
make -j$(nproc) && make install
cd ..
# pip install -e . -i https://pypi.tuna.tsinghua.edu.cn/simple
mim install -e . -i https://pypi.tuna.tsinghua.edu.cn/simple
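
Optionally verify both the Python package and the compiled SDK module (a sketch; build/lib is where the SDK Python bindings land, as noted in the next section):

python -c "import mmdeploy; print(mmdeploy.__version__)"
PYTHONPATH=~/Code/mmdeploy/build/lib python -c "import mmdeploy_runtime"   # silence means success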

If cuda.h cannot be found, add the header include path:

export CPLUS_INCLUDE_PATH=/usr/local/cuda-11.6/targets/x86_64-linux/include/

If any paths cannot be found, set these environment variables:

export CUDA_DIR=/usr/local/cuda
export TENSORRT_DIR=~/Code/TensorRT-8.5.3.1
export ONNXRUNTIME_DIR=~/Code/onnxruntime-linux-x64-1.8.1
export CUDNN_DIR=~/Code/cudnn-linux-x86_64-8.6.0.163_cuda11-archive
export MMDeploy_DIR=~/Code/mmdeploy
export PPLCV_DIR=~/Code/ppl.cv
export LD_LIBRARY_PATH=$PPLCV_DIR/cuda-build/install/lib:$MMDeploy_DIR/build/lib:$CUDA_DIR/lib64:$CUDNN_DIR/lib:$TENSORRT_DIR/lib:$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH


Python API and SDK tests

Convert the model first. This is the command that converts it to ONNX:

export PYTHONPATH=~/Code/mmdeploy/build/lib
python mmdeploy/tools/deploy.py  ~/Code/mmdeploy/configs/mmseg/segmentation_onnxruntime_dynamic.py  ~/Code/mmsegmentation/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py ~/Code/mmsegmentation/pretrained/deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth  ~/Code/mmsegmentation/demo/demo.png --work-dir ~/Code/mmsegmentation/onnx/ --device cuda --dump-info

Note: you must set PYTHONPATH=~/Code/mmdeploy/build/lib, otherwise Python reports that the mmdeploy package cannot be found.
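
If the conversion succeeded, the work dir should contain the model plus the SDK metadata written by --dump-info (a sketch of what to expect):

ls ~/Code/mmsegmentation/onnx/
# expect end2end.onnx together with deploy.json, pipeline.json and detail.json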

The following Python file contains both the ONNX and TensorRT conversions and tests:

import os

# paths used throughout; adjust home to your own account
home = '/home/jason'
image = '%s/Code/mmsegmentation/demo/demo.png'%(home)
device = 'cuda'
# step 1: convert the .pth checkpoint to ONNX (the conversion itself runs on CPU here)
cmd = 'cd ~/Code && python mmdeploy/tools/deploy.py  \
      %s/Code/mmdeploy/configs/mmseg/segmentation_onnxruntime_dynamic.py  \
      %s/Code/mmsegmentation/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py \
      %s/Code/mmsegmentation/pretrained/deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth  \
      %s/Code/mmsegmentation/demo/demo.png \
      --work-dir %s/Code/mmsegmentation/onnx/ --device cpu --dump-info'%(home,home,home,home,home)
pipe = os.popen(cmd)
print(pipe.read())
# step 2: load the exported ONNX model with the mmdeploy task processor and run one inference
from mmdeploy.apis.utils import build_task_processor
from mmdeploy.utils import get_input_shape, load_config
import torch
deploy_cfg = '%s/Code/mmdeploy/configs/mmseg/segmentation_onnxruntime_dynamic.py'%(home)
model_cfg = '%s/Code/mmsegmentation/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py'%(home)
backend_model = ['%s/Code/mmsegmentation/onnx/end2end.onnx'%(home)]
deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
task_processor = build_task_processor(model_cfg, deploy_cfg, 'cpu')
model = task_processor.build_backend_model(backend_model)
input_shape = get_input_shape(deploy_cfg)
model_inputs, _ = task_processor.create_input(image, input_shape)
with torch.no_grad():
    result = model.test_step(model_inputs)
print(result)
task_processor.visualize( image=image, model=model, result=result[0], window_name='visualize', output_file='./output_segmentation_onnx.png')
# step 3: convert the model to TensorRT (this one must run on cuda)
cmd = 'cd ~/Code && python mmdeploy/tools/deploy.py  \
  %s/Code/mmdeploy/configs/mmseg/segmentation_tensorrt_dynamic-512x1024-2048x2048.py  \
  %s/Code/mmsegmentation/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py \
  %s/Code/mmsegmentation/pretrained/deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth  \
  %s/Code/mmsegmentation/demo/demo.png \
  --work-dir %s/Code/mmsegmentation/trt/ --device cuda --dump-info --log-level INFO '%(home,home,home,home,home)
print(cmd)
pipe = os.popen(cmd)
print(pipe.read())
print('-------------------- start inference --------------------------')
from mmdeploy_runtime import Segmentor
import cv2
import numpy as np
img = cv2.imread(image)
# create a segmentor from the SDK model dir produced by --dump-info
segmentor = Segmentor(model_path='%s/Code/mmsegmentation/trt/'%(home), device_name='cuda', device_id=0)
# perform inference
seg = segmentor(img)
# visualize inference result
## generate a random palette of shape 256x3
palette = np.random.randint(0, 256, size=(256, 3))
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
for label, color in enumerate(palette):
  color_seg[seg == label, :] = color
# convert to BGR
color_seg = color_seg[..., ::-1]
img = img * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)
cv2.imwrite('output_segmentation_trt.png', img)


TensorRT C++ test

cpp file

#include <fstream>
#include <numeric>
#include <opencv2/imgcodecs/imgcodecs.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <random>
#include <string>
#include <vector>
#include "mmdeploy/segmentor.h"
using namespace std;
vector<cv::Vec3b> gen_palette(int num_classes) {
  std::mt19937 gen;
  std::uniform_int_distribution<ushort> uniform_dist(0, 255);
  vector<cv::Vec3b> palette;
  palette.reserve(num_classes);
  for (auto i = 0; i < num_classes; ++i) {
    palette.emplace_back(uniform_dist(gen), uniform_dist(gen), uniform_dist(gen));
  }
  return palette;
}

int main(int argc, char* argv[]) {
  if (argc != 4) {
    fprintf(stderr, "usage:\n  image_segmentation device_name model_path image_path\n");
    return 1;
  }
  auto device_name = argv[1];
  auto model_path = argv[2];
  auto image_path = argv[3];
  cv::Mat img = cv::imread(image_path);
  if (!img.data) {
    fprintf(stderr, "failed to load image: %s\n", image_path);
    return 1;
  }
  mmdeploy_segmentor_t segmentor{};
  int status{};
  status = mmdeploy_segmentor_create_by_path(model_path, device_name, 0, &segmentor);
  if (status != MMDEPLOY_SUCCESS) {
    fprintf(stderr, "failed to create segmentor, code: %d\n", (int)status);
    return 1;
  }
  mmdeploy_mat_t mat{
      img.data, img.rows, img.cols, 3, MMDEPLOY_PIXEL_FORMAT_BGR, MMDEPLOY_DATA_TYPE_UINT8};
  mmdeploy_segmentation_t* result{};
  status = mmdeploy_segmentor_apply(segmentor, &mat, 1, &result);
  if (status != MMDEPLOY_SUCCESS) {
    fprintf(stderr, "failed to apply segmentor, code: %d\n", (int)status);
    return 1;
  }
  auto palette = gen_palette(result->classes + 1);
  cv::Mat color_mask = cv::Mat::zeros(result->height, result->width, CV_8UC3);
  int pos = 0;
  int total = color_mask.rows * color_mask.cols;
  std::vector<int> idxs(result->classes);
  for (auto iter = color_mask.begin<cv::Vec3b>(); iter != color_mask.end<cv::Vec3b>(); ++iter) {
    // output mask: color each pixel by its predicted label
    if (result->mask) {
      *iter = palette[result->mask[pos++]];
    }
    // output score: pick the class with the highest score at this pixel
    if (result->score) {
      std::iota(idxs.begin(), idxs.end(), 0);
      auto k = std::max_element(idxs.begin(), idxs.end(),
                                [&](int i, int j) {
                                  return result->score[i * total + pos] < result->score[j * total + pos];
                                }) -
               idxs.begin();
      *iter = palette[k];
      pos += 1;
    }
  }
  img = img * 0.5 + color_mask * 0.5;
  cv::imwrite("output_segmentation.png", img);
  mmdeploy_segmentor_release_result(result, 1);
  mmdeploy_segmentor_destroy(segmentor);
  return 0;
}

CMakeLists.txt
cmake_minimum_required(VERSION 3.14)
project(mmdeploy-example)
if (NOT (${CMAKE_PROJECT_NAME} STREQUAL "MMDeploy"))
    find_package(MMDeploy REQUIRED)
endif ()
# OpenCV provides ${OpenCV_LIBS}, linked below
find_package(OpenCV REQUIRED)
set(name image_segmentation)
# Search for c/cpp sources
file(GLOB _SRCS ${name}.c*)
add_executable(${name} ${_SRCS})
if (NOT (MSVC OR APPLE))
    # Disable new dtags so that executables can run even without LD_LIBRARY_PATH set
    target_link_libraries(${name} PRIVATE -Wl,--disable-new-dtags)
endif ()
if (MMDEPLOY_BUILD_SDK_MONOLITHIC)
    target_link_libraries(${name} PRIVATE mmdeploy ${OpenCV_LIBS})
else ()
    # Load MMDeploy modules
    mmdeploy_load_static(${name} MMDeployStaticModules)
    mmdeploy_load_dynamic(${name} MMDeployDynamicModules)
    # Link to MMDeploy libraries
    target_link_libraries(${name} PRIVATE MMDeployLibs ${OpenCV_LIBS})
endif ()
install(TARGETS ${name} RUNTIME DESTINATION bin)


Run script

export CUDA_DIR=/usr/local/cuda
export TENSORRT_DIR=~/Code/TensorRT-8.5.3.1
export ONNXRUNTIME_DIR=~/Code/onnxruntime-linux-x64-1.8.1
export CUDNN_DIR=~/Code/cudnn-linux-x86_64-8.6.0.163_cuda11-archive
export MMDeploy_DIR=~/Code/mmdeploy
export LD_LIBRARY_PATH=$MMDeploy_DIR/build/lib:$CUDA_DIR/lib64:$CUDNN_DIR/lib:$TENSORRT_DIR/lib:$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH
cd build
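
The original stops at the exports; a minimal sketch of the configure/build/run steps that would follow, assuming mmdeploy was installed into its default build/install prefix and using the SDK model dir produced by --dump-info earlier:

cmake .. -DMMDeploy_DIR=$MMDeploy_DIR/build/install/lib/cmake/MMDeploy
make -j$(nproc)
./image_segmentation cuda ~/Code/mmsegmentation/trt ~/Code/mmsegmentation/demo/demo.png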