首发于 YoloX
YOLOX之TensorRT部署:Python版

YOLOX之TensorRT部署:Python版

1. TensorRT安装:请参考下面YOLOv5的C++版(环境安装部分包含Python配置)

2. PyCUDA安装:请参考下面YOLOv5的Python版

3. 主要流程

3.1 安装torch2trt

pip3 install torch2trt

3.2 执行tools/trt.py将ckpt文件转换为engine文件

python3 tools/trt.py -f exps/default/yolox_s.py \
                     -c weights/yolox_s.pth

注:torch2trt受TensorRT版本限制,有一定的局限性,C++版序列化代码正在编辑中,后续在专栏推出。

3.3 主要流程

3.3.1 创建runtime

import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
# Create a CUDA context on device 0 and keep a handle to it.
# NOTE(review): `import pycuda.autoinit` above presumably already sets up a
# context, and nothing visible here pops/detaches `cfx` -- confirm it is
# released elsewhere.
cfx = cuda.Device(0).make_context()
# Stream used by the asynchronous copies and the inference call further down.
stream = cuda.Stream()
# Logger at INFO severity, handed to the TensorRT runtime.
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
# Runtime object used below to deserialize the engine file.
runtime = trt.Runtime(TRT_LOGGER)

3.3.2 反序列化并创建context

# Read the serialized engine from disk (e.g. engine_file_path = "yolox.engine")
# and rebuild it with the runtime created earlier.
with open(engine_file_path, "rb") as engine_file:
    serialized_engine = engine_file.read()
    engine = runtime.deserialize_cuda_engine(serialized_engine)
# Execution context used to launch inference on the engine.
context = engine.create_execution_context()

3.3.3 分配空间并绑定输入输出

# Host/device buffers split by direction; `bindings` collects the device
# addresses in the order the engine yields its bindings.
host_inputs, cuda_inputs = [], []
host_outputs, cuda_outputs = [], []
bindings = []
for binding in engine:
    n_elems = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
    np_dtype = trt.nptype(engine.get_binding_dtype(binding))
    # Page-locked host buffer plus a device allocation of the same byte size.
    pinned = cuda.pagelocked_empty(n_elems, np_dtype)
    device = cuda.mem_alloc(pinned.nbytes)
    # Record the device buffer address for the execution call.
    bindings.append(int(device))
    # File the pair under inputs or outputs.
    if engine.binding_is_input(binding):
        host_side, device_side = host_inputs, cuda_inputs
    else:
        host_side, device_side = host_outputs, cuda_outputs
    host_side.append(pinned)
    device_side.append(device)

3.3.4 图像预处理

def preprocess_image_yolox(image, input_w,input_h,mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Letterbox, normalize and lay out an image as an NCHW float32 batch of one.

    The image is resized with its aspect ratio preserved so that it fits in
    ``input_w`` x ``input_h``, pasted into the top-left corner of a canvas
    filled with 114, channel-reversed, scaled to [0, 1], optionally
    normalized with ``mean``/``std``, and returned as a contiguous array of
    shape ``(1, 3, input_h, input_w)``.

    Args:
        image: Input image array, HxWx3, HxWx1 or HxW. Gray / single-channel
            input is replicated to three channels (previously such input
            raised, because the later channel operations need three axes).
        input_w: Network input width in pixels.
        input_h: Network input height in pixels.
        mean: Per-channel mean subtracted after scaling to [0, 1], applied
            after the channel reversal; ``None`` skips the step.
        std: Per-channel divisor applied after mean subtraction; ``None``
            skips the step.

    Returns:
        Tuple ``(srcImg, image, r)``: a copy of the untouched input, the
        preprocessed NCHW float32 array, and the resize ratio ``r`` applied
        to the original image.
    """
    srcImg = image.copy()
    img = np.array(image)

    # Promote gray / single-channel input to three identical channels so the
    # channel reversal and HWC -> CHW transpose below are always valid.
    if img.ndim == 2:
        img = img[:, :, np.newaxis]
    if img.shape[2] == 1:
        img = np.repeat(img, 3, axis=2)

    # Single scale factor that makes the image fit inside the network input.
    r = min(input_h / img.shape[0], input_w / img.shape[1])
    new_h = int(img.shape[0] * r)
    new_w = int(img.shape[1] * r)
    resized_img = cv2.resize(
        img, (new_w, new_h), interpolation=cv2.INTER_LINEAR
    ).astype(np.float32)

    # Canvas pre-filled with the padding value 114; the resized image goes in
    # the top-left corner.
    padded_img = np.full((input_h, input_w, 3), 114.0, dtype=np.float32)
    padded_img[:new_h, :new_w] = resized_img

    # Reverse the channel axis (BGR -> RGB for OpenCV-loaded images).
    image = padded_img[:, :, ::-1]
    image /= 255.0
    if mean is not None:
        image -= mean
    if std is not None:
        image /= std

    # HWC -> CHW, then CHW -> NCHW.
    image = np.transpose(image, [2, 0, 1])
    image = np.expand_dims(image, axis=0)
    # Row-major ("C order") memory layout, so the array can be flattened
    # straight into the input buffer.
    image = np.ascontiguousarray(image)
    return srcImg,image, r

3.3.5 执行推理

# Copy the flattened input image into the first host input buffer.
np.copyto(host_inputs[0], input_image.ravel())
# Transfer the input data to the GPU.
cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
# Run inference.
context.execute_async(bindings=bindings, stream_handle=stream.handle)
# Transfer the inference result back to the CPU.
cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
# Synchronize the stream.
stream.synchronize()
# Fetch the inference result (batch_size = 1).
output = host_outputs[0]

3.3.6 后处理操作

此处代码与YOLOv5的后处理不同,关键在于先理解YOLOX head部分的代码,再据此对输出结果进行解析。这部分花费了本人一些时间,如果有需要,麻烦添加关注并留言获取。
补充:现将后处理部分代码公开如下。
def post_process_yolox(self,prediction, ratio,num_classes=80, conf_thre=0.6, nms_thre=0.45):       
        pred = np.reshape(prediction, (1,-1, 5+num_classes))#[:num, :]
        pred = self.decode_outputs(pred)
        pred = torch.Tensor(pred).cpu()
        box_corner = pred.new(pred.shape)
        box_corner[:, :, 0] = pred[:, :, 0] - pred[:, :, 2] / 2
        box_corner[:, :, 1] = pred[:, :, 1] - pred[:, :, 3] / 2
        box_corner[:, :, 2] = pred[:, :, 0] + pred[:, :, 2] / 2
        box_corner[:, :, 3] = pred[:, :, 1] + pred[:, :, 3] / 2
        pred[:, :, :4] = box_corner[:, :, :4]
        output = [None for _ in range(len(pred))]
        for i, image_pred in enumerate(pred):
            # If none are remaining => process next image
            if not image_pred.size(0):
                continue
            # Get score and class with highest confidence
            class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
            conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze()
            # _, conf_mask = torch.topk((image_pred[:, 4] * class_conf.squeeze()), 1000)
            # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
            detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
            detections = detections[conf_mask]
            if not detections.size(0):
                continue
            nms_out_index = torchvision.ops.batched_nms(
                detections[:, :4],
                detections[:, 4] * detections[:, 5],
                detections[:, 6],
                nms_thre,
            detections = detections[nms_out_index]
            if output[i] is None:
                output[i] = detections
            else:
                output[i] = torch.cat((output[i], detections))
        outputs = output[0]
        bboxes = outputs[:, 0:4]