TensorRT dynamic input (dynamic shapes)
Several readers have asked for the complete code in the comments or by private message, and I never found the time to reply individually, so here it is in full. The code is adapted from implementations found online and from the official samples, and it runs on my machine. Every project has different requirements, so adapt it as you see fit.
import sys
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context
import time
import copy
import numpy as np
import os
import torch
import cv2

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
device = 'cuda:0'

def GiB(val):
    return val * 1 << 30
def build_engine(onnx_path, using_half, engine_file, dynamic_input=False):
    ...  # see the full definition later in this article
def allocate_buffers(engine, is_explicit_batch=False, input_shape=None):
    inputs = []
    outputs = []
    bindings = []

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

        def __repr__(self):
            return self.__str__()

    for binding in engine:
        dims = engine.get_binding_shape(binding)
        print(dims)
        if dims[-1] == -1:
            # Dynamic H/W: substitute the actual input shape before sizing the buffers.
            assert input_shape is not None
            dims[-2], dims[-1] = input_shape
        size = trt.volume(dims) * engine.max_batch_size  # max_batch_size is 1 for an explicit-batch engine
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host (page-locked) and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to the bindings list.
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):  # determine whether this binding is an input
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
def preprocess_image(imagepath):
    origin_img = cv2.imread(imagepath)  # BGR
    origin_height = origin_img.shape[0]
    origin_width = origin_img.shape[1]
    new_height = 1248
    new_width = 1248
    pad_img = cv2.resize(origin_img, (new_width, new_height))  # cv2.resize expects (width, height)
    pad_img = pad_img[:, :, ::-1].transpose(2, 0, 1)  # BGR -> RGB, HWC -> CHW
    pad_img = pad_img.astype(np.float32)
    pad_img /= 255.0
    pad_img = np.ascontiguousarray(pad_img)
    pad_img = np.expand_dims(pad_img, axis=0)
    return pad_img, (new_height, new_width), (origin_height, origin_width)
def profile_trt(engine, imagepath, batch_size, num_warmups=0, num_iters=1):
    ...  # see the full definition later in this article
if __name__ == '__main__':
    onnx_path = 'Singlegpu.onnx'
    usinghalf = True
    batch_size = 1
    imagepath = 'testimgs/'
    engine_file = 'Singlegpu.engine'
    init_engine = True
    load_engine = False
    if init_engine:
        trt_engine = build_engine(onnx_path, usinghalf, engine_file, dynamic_input=True)
        print('engine built successfully!')
        with open(engine_file, "wb") as f:
            f.write(trt_engine.serialize())
        print('save engine successfully')
    if load_engine:
        trt.init_libnvinfer_plugins(None, '')
        with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            trt_engine = runtime.deserialize_cuda_engine(f.read())
    if os.path.isdir(imagepath):
        imagepaths = []
        for imagname in os.listdir(imagepath):
            temppath = os.path.join(imagepath, imagname)
            imagepaths.append(temppath)
    else:
        imagepaths = [imagepath]
    for tempimagepath in imagepaths:
        trt_result = profile_trt(trt_engine, tempimagepath, batch_size, 0, 1)
        trt_result = (trt_result > 0.5)
        cv2.imwrite(tempimagepath.replace('.jpg', 'result1.png'), trt_result * 255)
——————————————— manual divider (2021.03.18) ———————————————
I am writing this post for two reasons: 1. plenty of other people surely need it; 2. none of the posts I could find explained it clearly, and the official documentation is not much better, so you end up half-reading, half-guessing. Without further ado, here is the code.
The example below goes from PyTorch to ONNX to TensorRT, with the image height and width as the dynamic dimensions.
PyTorch to ONNX:
def export_onnx(model, image_shape, onnx_path, batch_size=1):
    x, y = image_shape
    img = torch.zeros((batch_size, 3, x, y))
    dynamic_onnx = True
    if dynamic_onnx:
        # Mark the H and W axes (2 and 3) of both the input and the output as dynamic.
        dynamic_ax = {'input_1': {2: 'image_height', 3: 'image_width'},
                      'output_1': {2: 'image_height', 3: 'image_width'}}
        torch.onnx.export(model, (img), onnx_path,
                          input_names=["input_1"], output_names=["output_1"],
                          verbose=False, opset_version=11, dynamic_axes=dynamic_ax)
    else:
        torch.onnx.export(model, (img), onnx_path,
                          input_names=["input_1"], output_names=["output_1"],
                          verbose=False, opset_version=11)
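For reference, a minimal usage sketch; the toy model below is just a stand-in for your own network, and the file name simply follows the rest of this article:

import torch.nn as nn

toy_model = nn.Sequential(nn.Conv2d(3, 1, kernel_size=3, padding=1))  # stand-in for your own model
toy_model.eval()
export_onnx(toy_model, (512, 512), 'Singlegpu.onnx', batch_size=1)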
ONNX to TensorRT:
According to NVIDIA's definition of dynamic shapes in the official documentation, "dynamic" simply means that a dimension is left unspecified (set to -1) when the engine is defined and is only fixed at inference time. Both the engine-building code and the inference code therefore need changes.
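Before building the engine you can sanity-check that the exported model really carries dynamic height/width axes; a minimal sketch, assuming the onnx Python package is installed and the file exported in the previous step:

import onnx

model = onnx.load('Singlegpu.onnx')
for inp in model.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)  # expected: input_1 [1, 3, 'image_height', 'image_width']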
When building the engine, the network parsed from the ONNX file already has dynamic input and output shapes; all you need to add is an optimization profile that pins down the range of input sizes.
def build_engine(onnx_path, using_half, engine_file, dynamic_input=True):
    trt.init_libnvinfer_plugins(None, '')
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1  # always 1 for an explicit-batch network
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(1)
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)
        # Load the ONNX model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        ## Added for dynamic shapes
        if dynamic_input:
            profile = builder.create_optimization_profile()
            # set_shape(name, min, opt, max): this profile accepts inputs from 512x512 up to 1600x1600.
            profile.set_shape("input_1", (1, 3, 512, 512), (1, 3, 1024, 1024), (1, 3, 1600, 1600))
            config.add_optimization_profile(profile)
        # Append a sigmoid layer to the network output
        previous_output = network.get_output(0)
        network.unmark_output(previous_output)
        sigmoid_layer = network.add_activation(previous_output, trt.ActivationType.SIGMOID)
        network.mark_output(sigmoid_layer.get_output(0))
        return builder.build_engine(network, config)
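A rough usage sketch (paths as elsewhere in this article); the height/width bindings of the resulting engine stay at -1 until the actual shape is set at inference time:

engine = build_engine('Singlegpu.onnx', using_half=True,
                      engine_file='Singlegpu.engine', dynamic_input=True)
print(engine.get_binding_shape(0))  # e.g. (1, 3, -1, -1): H and W are still dynamic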
Inference hides a sizeable pitfall. My original understanding was that with a dynamic-input engine I only had to allocate a suitable buffer for the input and could then run inference on any size directly; that turned out to be naive. Following the official documentation, you must add the line context.active_optimization_profile = 0 at inference time to select the corresponding optimization profile. I added it, and it still failed. The reason: since we did not fix the input size when defining the engine, we have to set the actual input shape on the execution context at inference time.
def profile_trt(engine, imagepath, batch_size):
    assert engine is not None
    input_image, input_shape = preprocess_image(imagepath)
    segment_inputs, segment_outputs, segment_bindings = allocate_buffers(engine, True, input_shape)
    stream = cuda.Stream()
    with engine.create_execution_context() as context:
        context.active_optimization_profile = 0  # added: select the optimization profile
        origin_inputshape = context.get_binding_shape(0)
        # Added: if the engine was built with dynamic H/W, fill in the actual input shape.
        if origin_inputshape[-1] == -1:
            origin_inputshape[-2], origin_inputshape[-1] = input_shape
            context.set_binding_shape(0, origin_inputshape)
        input_img_array = np.array([input_image] * batch_size)
        img = torch.from_numpy(input_img_array).float().numpy()
        segment_inputs[0].host = img
        # Copy the input from the host buffer to the device pointer asynchronously.
        [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in segment_inputs]
        stream.synchronize()  # wait for all activity on this stream to cease
        context.execute_async(bindings=segment_bindings, stream_handle=stream.handle)  # asynchronously execute inference
        stream.synchronize()
        # Copy the output from the device pointer back to the host buffer asynchronously.
        [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in segment_outputs]
        stream.synchronize()
        results = np.array(segment_outputs[0].host).reshape(batch_size, input_shape[0], input_shape[1])
    return results.transpose(1, 2, 0)
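To convince yourself that the shape propagation works, a small check like the one below can help; it assumes an engine built by the build_engine above, with binding 0 as the input and binding 1 as the output:

with engine.create_execution_context() as context:
    context.active_optimization_profile = 0
    context.set_binding_shape(0, (1, 3, 768, 768))  # any H/W inside the profile range
    print(context.get_binding_shape(0))             # (1, 3, 768, 768)
    print(context.get_binding_shape(1))             # output shape, with the -1 dims now resolved
    print(context.all_binding_shapes_specified)     # True once every dynamic input has been set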
It is only a handful of lines, yet it cost me a whole day of fiddling. Still, the dynamic-input problem is solved without a pile of messy workaround code; I hope this saves whoever finds it a few detours.