TensorRT dynamic input (dynamic shapes)
Several readers have asked for the complete code in the comments or by private message, and I never found the time to reply individually, so here it is in full. The code is adapted from implementations found online and from the official samples, and it runs on my machine. Every project has different requirements, so adapt it as you see fit.
import sys
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context
import time
import copy
import numpy as np
import os
import torch
import cv2

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
device = 'cuda:0'

def GiB(val):
    return val * 1 << 30
def build_engine(onnx_path, using_half, engine_file, dynamic_input=False):
    ...  # see the full definition later in this article
def allocate_buffers(engine, is_explicit_batch=False, input_shape=None):
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        print(dims)
        if dims[-1] == -1:
            # Dynamic H/W: substitute the actual input shape before sizing the buffers.
            assert input_shape is not None
            dims[-2], dims[-1] = input_shape
        size = trt.volume(dims) * engine.max_batch_size  # max_batch_size is 1 for an explicit-batch engine
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host (page-locked) and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to the bindings list.
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):  # determine whether this binding is an input
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
def preprocess_image(imagepath):
    origin_img = cv2.imread(imagepath)  # BGR
    origin_height = origin_img.shape[0]
    origin_width = origin_img.shape[1]
    new_height = 1248
    new_width = 1248
    pad_img = cv2.resize(origin_img, (new_width, new_height))  # cv2.resize expects (width, height)
    pad_img = pad_img[:, :, ::-1].transpose(2, 0, 1)  # BGR -> RGB, HWC -> CHW
    pad_img = pad_img.astype(np.float32)
    pad_img /= 255.0
    pad_img = np.ascontiguousarray(pad_img)
    pad_img = np.expand_dims(pad_img, axis=0)
    return pad_img, (new_height, new_width), (origin_height, origin_width)
def profile_trt(engine, imagepath, batch_size, num_warmups=0, num_iters=1):
    ...  # see the full definition later in this article
if __name__ == '__main__':
    onnx_path = 'Singlegpu.onnx'
    usinghalf = True
    batch_size = 1
    imagepath = 'testimgs/'
    engine_file = 'Singlegpu.engine'
    init_engine = True
    load_engine = False
    if init_engine:
        trt_engine = build_engine(onnx_path, usinghalf, engine_file, dynamic_input=True)
        print('engine built successfully!')
        with open(engine_file, "wb") as f:
            f.write(trt_engine.serialize())
        print('save engine successfully')
    if load_engine:
        trt.init_libnvinfer_plugins(None, '')
        with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            trt_engine = runtime.deserialize_cuda_engine(f.read())
    if os.path.isdir(imagepath):
        imagepaths = []
        for imagname in os.listdir(imagepath):
            temppath = os.path.join(imagepath, imagname)
            imagepaths.append(temppath)
    else:
        imagepaths = [imagepath]
    for tempimagepath in imagepaths:
        trt_result = profile_trt(trt_engine, tempimagepath, batch_size, 0, 1)
        trt_result = (trt_result > 0.5)
        cv2.imwrite(tempimagepath.replace('.jpg', 'result1.png'), trt_result * 255)
——————————————— manual divider (2021.03.18) ———————————————
I am writing this post for two reasons: 1. plenty of other people surely need it; 2. none of the posts I could find explained it clearly, and the official documentation is not much better, so you end up half-reading, half-guessing. Without further ado, here is the code.
The example below goes from PyTorch to ONNX to TensorRT, with the image height and width as the dynamic dimensions.
PyTorch to ONNX:
def export_onnx(model, image_shape, onnx_path, batch_size=1):
    x, y = image_shape
    img = torch.zeros((batch_size, 3, x, y))
    dynamic_onnx = True
    if dynamic_onnx:
        # Mark the H and W axes (2 and 3) of both the input and the output as dynamic.
        dynamic_ax = {'input_1': {2: 'image_height', 3: 'image_width'},
                      'output_1': {2: 'image_height', 3: 'image_width'}}
        torch.onnx.export(model, (img), onnx_path,
                          input_names=["input_1"], output_names=["output_1"],
                          verbose=False, opset_version=11, dynamic_axes=dynamic_ax)
    else:
        torch.onnx.export(model, (img), onnx_path,
                          input_names=["input_1"], output_names=["output_1"],
                          verbose=False, opset_version=11)
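For reference, a minimal usage sketch; the toy model below is just a stand-in for your own network, and the file name simply follows the rest of this article:

import torch.nn as nn

toy_model = nn.Sequential(nn.Conv2d(3, 1, kernel_size=3, padding=1))  # stand-in for your own model
toy_model.eval()
export_onnx(toy_model, (512, 512), 'Singlegpu.onnx', batch_size=1)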
ONNX to TensorRT:
According to NVIDIA's definition of dynamic shapes in the official documentation, "dynamic" simply means that a dimension is left unspecified (set to -1) when the engine is defined and is only fixed at inference time. Both the engine-building code and the inference code therefore need changes.
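Before building the engine you can sanity-check that the exported model really carries dynamic height/width axes; a minimal sketch, assuming the onnx Python package is installed and the file exported in the previous step:

import onnx

model = onnx.load('Singlegpu.onnx')
for inp in model.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)  # expected: input_1 [1, 3, 'image_height', 'image_width']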
When building the engine, the network parsed from the ONNX file already has dynamic input and output shapes; all you need to add is an optimization profile that pins down the range of input sizes.
def build_engine(onnx_path, using_half, engine_file, dynamic_input=True):
    trt.init_libnvinfer_plugins(None, '')
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1  # always 1 for an explicit-batch network
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(1)
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)
        # Load the ONNX model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        ## Added for dynamic shapes
        if dynamic_input:
            profile = builder.create_optimization_profile()
            # set_shape(name, min, opt, max): this profile accepts inputs from 512x512 up to 1600x1600.
            profile.set_shape("input_1", (1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 1600, 1600))
            config.add_optimization_profile(profile)
        # Append a sigmoid layer to the network output
        previous_output = network.get_output(0)
        network.unmark_output(previous_output)
        sigmoid_layer = network.add_activation(previous_output, trt.ActivationType.SIGMOID)
        network.mark_output(sigmoid_layer.get_output(0))
        return builder.build_engine(network, config)
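A rough usage sketch (paths as elsewhere in this article); the height/width bindings of the resulting engine stay at -1 until the actual shape is set at inference time:

engine = build_engine('Singlegpu.onnx', using_half=True,
                      engine_file='Singlegpu.engine', dynamic_input=True)
print(engine.get_binding_shape(0))  # e.g. (1, 3, -1, -1): H and W are still dynamic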
Inference hides a sizeable pitfall. My original understanding was that with a dynamic-input engine I only had to allocate a suitable buffer for the input and could then run inference on any size directly; that turned out to be naive. Following the official documentation, you must add the line context.active_optimization_profile = 0 at inference time to select the corresponding optimization profile. I added it, and it still failed. The reason: since we did not fix the input size when defining the engine, we have to set the actual input shape on the execution context at inference time.
def profile_trt(engine, imagepath, batch_size):
    assert engine is not None
    input_image, input_shape = preprocess_image(imagepath)
    segment_inputs, segment_outputs, segment_bindings = allocate_buffers(engine, True, input_shape)
    stream = cuda.Stream()
    with engine.create_execution_context() as context:
        context.active_optimization_profile = 0  # added: select the optimization profile
        origin_inputshape = context.get_binding_shape(0)
        # Added: if the engine was built with dynamic H/W, fill in the actual input shape.
        if origin_inputshape[-1] == -1:
            origin_inputshape[-2], origin_inputshape[-1] = input_shape
            context.set_binding_shape(0, origin_inputshape)
        input_img_array = np.array([input_image] * batch_size)
        img = torch.from_numpy(input_img_array).float().numpy()
        segment_inputs[0].host = img
        # Copy the input from the host buffer to the device pointer asynchronously.
        [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in segment_inputs]
        stream.synchronize()  # wait for all activity on this stream to cease
        context.execute_async(bindings=segment_bindings, stream_handle=stream.handle)  # asynchronously execute inference
        stream.synchronize()
        # Copy the output from the device pointer back to the host buffer asynchronously.
        [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in segment_outputs]
        stream.synchronize()
        results = np.array(segment_outputs[0].host).reshape(batch_size, input_shape[0], input_shape[1])
    return results.transpose(1, 2, 0)
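To convince yourself that the shape propagation works, a small check like the one below can help; it assumes an engine built by the build_engine above, with binding 0 as the input and binding 1 as the output:

with engine.create_execution_context() as context:
    context.active_optimization_profile = 0
    context.set_binding_shape(0, (1, 3, 768, 768))  # any H/W inside the profile range
    print(context.get_binding_shape(0))             # (1, 3, 768, 768)
    print(context.get_binding_shape(1))             # output shape, with the -1 dims now resolved
    print(context.all_binding_shapes_specified)     # True once every dynamic input has been set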
It is only a handful of lines, yet it cost me a whole day of fiddling. Still, the dynamic-input problem is solved without a pile of messy workaround code; I hope this saves whoever finds it a few detours.