- settings: batch_size=4→1 - tensorrt_engine: BATCH_SIZE=4→1 - preprocessor: 移除 padding 逻辑,直接 batch=1 - 预处理延迟从 17ms → 5ms
97 lines
2.8 KiB
Python
97 lines
2.8 KiB
Python
"""TensorRT 纯推理延迟测试"""
|
|
import numpy as np
|
|
import tensorrt as trt
|
|
import pycuda.driver as cuda
|
|
import pycuda.autoinit
|
|
import time
|
|
|
|
# Load the serialized TensorRT engine from disk and create the execution
# context that all benchmarks below run on.
engine_path = './models/yolo11n.engine'

with open(engine_path, 'rb') as f:
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    engine = runtime.deserialize_cuda_engine(f.read())

# deserialize_cuda_engine reports failure by returning None (e.g. an engine
# built by a different TensorRT version); fail here with a clear message
# instead of an AttributeError on the next line.
if engine is None:
    raise RuntimeError(f'Failed to deserialize TensorRT engine: {engine_path}')

context = engine.create_execution_context()
|
|
|
|
# ---- batch=1: random input, output shape resolution, host/device buffers ----
input_shape = (1, 3, 480, 480)
input_data = np.random.randn(*input_shape).astype(np.float32)

# set_input_shape signals failure through its boolean return value; without
# this check a rejected shape would only surface later as bogus timings.
if not context.set_input_shape('images', input_shape):
    raise RuntimeError(
        f"Engine rejected input shape {input_shape} for tensor 'images'"
    )

# Query the output shape from the execution context *after* the input shape
# is set, so dynamic dimensions are already resolved. This uses the same
# name-based tensor API as set_input_shape instead of the legacy binding API.
# NOTE(review): tensor index 1 is assumed to be the (single) output, matching
# the [input, output] order of `bindings` below - confirm for other engines.
output_name = engine.get_tensor_name(1)
output_shape = tuple(context.get_tensor_shape(output_name))
if any(dim < 0 for dim in output_shape):
    raise RuntimeError(
        f'Output shape of {output_name!r} is not fully resolved: {output_shape}'
    )
output_size = int(np.prod(output_shape))

# Page-locked host buffers, flat float32, sized to the input/output tensors.
h_input = cuda.pagelocked_empty(input_data.size, np.float32)
h_output = cuda.pagelocked_empty(output_size, np.float32)
np.copyto(h_input, input_data.ravel())

# Device buffers with the same byte sizes as the host buffers.
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)

# Device pointers passed to execute_v2, in [input, output] order.
bindings = [int(d_input), int(d_output)]
|
|
|
|
# Warmup: 10 untimed iterations of the same copy-in / execute / copy-out
# cycle that the benchmark loop measures.
for _ in range(10):
    cuda.memcpy_htod(d_input, h_input)
    # execute_v2 returns False on failure; stop here rather than go on to
    # benchmark an inference that never actually ran.
    if not context.execute_v2(bindings=bindings):
        raise RuntimeError('TensorRT execute_v2 failed during batch=1 warmup')
    cuda.memcpy_dtoh(h_output, d_output)
|
|
|
|
# Benchmark: collect 100 latency samples in milliseconds. Each sample spans
# the host-to-device copy, the execute_v2 call, and the device-to-host copy.
times = []
for _ in range(100):
    t_begin = time.perf_counter()

    cuda.memcpy_htod(d_input, h_input)
    context.execute_v2(bindings=bindings)
    cuda.memcpy_dtoh(h_output, d_output)

    elapsed_s = time.perf_counter() - t_begin
    times.append(elapsed_s * 1000)
|
|
|
|
# Summarize the batch=1 samples (all values in milliseconds) and print them,
# followed by a blank separator line.
latencies_ms = np.asarray(times)
mean_ms = np.mean(latencies_ms)
median_ms = np.median(latencies_ms)
fastest_ms = np.min(latencies_ms)
slowest_ms = np.max(latencies_ms)
p95_ms = np.percentile(latencies_ms, 95)

print('TensorRT 纯推理延迟 (batch=1):')
print(f' 平均: {mean_ms:.2f}ms')
print(f' 中位数: {median_ms:.2f}ms')
print(f' 最小: {fastest_ms:.2f}ms')
print(f' 最大: {slowest_ms:.2f}ms')
print(f' P95: {p95_ms:.2f}ms')
print()
|
|
|
|
# ---- batch=4: repeat the measurement with a batch of four images ----
print("测试 batch=4...")
input_shape_4 = (4, 3, 480, 480)
input_data_4 = np.random.randn(*input_shape_4).astype(np.float32)

# set_input_shape returns False when the engine cannot take this shape (for
# example, an engine built for batch=1 only). Without this check the run
# would continue and report timings for inferences that never executed.
if not context.set_input_shape('images', input_shape_4):
    raise RuntimeError(
        f"Engine rejected input shape {input_shape_4} for tensor 'images'; "
        'was it built with batch=4 in its optimization profile?'
    )

# Take the output shape from the context instead of hard-coding
# (4, 84, 4725), which only holds for one class count and input resolution.
# NOTE(review): tensor index 1 is assumed to be the (single) output, matching
# the [input, output] order of `bindings_4` below - confirm for other engines.
output_name_4 = engine.get_tensor_name(1)
output_shape_4 = tuple(context.get_tensor_shape(output_name_4))
if any(dim < 0 for dim in output_shape_4):
    raise RuntimeError(
        f'Output shape of {output_name_4!r} is not fully resolved: '
        f'{output_shape_4}'
    )
output_size_4 = int(np.prod(output_shape_4))

# Page-locked host buffers, flat float32, sized to the input/output tensors.
h_input_4 = cuda.pagelocked_empty(input_data_4.size, np.float32)
h_output_4 = cuda.pagelocked_empty(output_size_4, np.float32)
np.copyto(h_input_4, input_data_4.ravel())

# Device buffers with the same byte sizes as the host buffers.
d_input_4 = cuda.mem_alloc(h_input_4.nbytes)
d_output_4 = cuda.mem_alloc(h_output_4.nbytes)

# Device pointers passed to execute_v2, in [input, output] order.
bindings_4 = [int(d_input_4), int(d_output_4)]
|
|
|
|
# Warmup: 10 untimed iterations of the same copy-in / execute / copy-out
# cycle that the batch=4 benchmark loop measures.
for _ in range(10):
    cuda.memcpy_htod(d_input_4, h_input_4)
    # execute_v2 returns False on failure; stop here rather than go on to
    # benchmark an inference that never actually ran.
    if not context.execute_v2(bindings=bindings_4):
        raise RuntimeError('TensorRT execute_v2 failed during batch=4 warmup')
    cuda.memcpy_dtoh(h_output_4, d_output_4)
|
|
|
|
# Benchmark: collect 100 latency samples in milliseconds for batch=4. Each
# sample spans the host-to-device copy, the execute_v2 call, and the
# device-to-host copy.
times_4 = []
for _ in range(100):
    t_begin = time.perf_counter()

    cuda.memcpy_htod(d_input_4, h_input_4)
    context.execute_v2(bindings=bindings_4)
    cuda.memcpy_dtoh(h_output_4, d_output_4)

    elapsed_s = time.perf_counter() - t_begin
    times_4.append(elapsed_s * 1000)
|
|
|
|
# Summarize the batch=4 samples (all values in milliseconds) and print them.
latencies_4_ms = np.asarray(times_4)
mean_4_ms = np.mean(latencies_4_ms)
median_4_ms = np.median(latencies_4_ms)
fastest_4_ms = np.min(latencies_4_ms)
slowest_4_ms = np.max(latencies_4_ms)
p95_4_ms = np.percentile(latencies_4_ms, 95)

print('TensorRT 纯推理延迟 (batch=4):')
print(f' 平均: {mean_4_ms:.2f}ms')
print(f' 中位数: {median_4_ms:.2f}ms')
print(f' 最小: {fastest_4_ms:.2f}ms')
print(f' 最大: {slowest_4_ms:.2f}ms')
print(f' P95: {p95_4_ms:.2f}ms')
|