Files
security-ai-edge/benchmark_trt.py
16337 c17f983ab3 perf: batch=1 优化减少延迟
- settings: batch_size=4→1
- tensorrt_engine: BATCH_SIZE=4→1
- preprocessor: 移除 padding 逻辑,直接 batch=1
- 预处理延迟从 17ms → 5ms
2026-02-02 15:25:13 +08:00

97 lines
2.8 KiB
Python

"""TensorRT 纯推理延迟测试"""
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import time
# Location of the serialized TensorRT engine that the benchmark runs.
engine_path = './models/yolo11n.engine'

# Read the whole serialized engine; the file handle is released as soon as
# the bytes are in memory.
with open(engine_path, 'rb') as f:
    serialized_engine = f.read()

runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
engine = runtime.deserialize_cuda_engine(serialized_engine)
# deserialize_cuda_engine() signals failure by returning None instead of
# raising, so stop here with a clear message rather than with an
# AttributeError on the next call.
if engine is None:
    raise RuntimeError(f'Failed to deserialize TensorRT engine: {engine_path}')

# The execution context holds the per-inference state (input shapes, etc.).
context = engine.create_execution_context()
if context is None:
    raise RuntimeError('Failed to create a TensorRT execution context')
# ---- Host/device buffers for the batch=1 run ----
input_shape = (1, 3, 480, 480)
input_data = np.random.randn(*input_shape).astype(np.float32)

# set_input_shape() returns False when the shape is rejected (e.g. it is not
# covered by the engine's optimization profile); benchmarking would then be
# meaningless, so fail loudly.
if not context.set_input_shape('images', input_shape):
    raise RuntimeError(f'Engine rejected input shape {input_shape}')

# Query the output shape from the execution context AFTER the input shape has
# been set, so dynamic dimensions are already resolved. This replaces the
# deprecated engine.get_binding_shape(1), which reports -1 for dynamic dims.
# Assumes tensor index 1 is the single output, matching the bindings order
# used below.
output_name = engine.get_tensor_name(1)
output_shape = tuple(context.get_tensor_shape(output_name))
if any(dim < 0 for dim in output_shape):
    raise RuntimeError(
        f'Output shape of {output_name!r} is not fully resolved: {output_shape}'
    )
output_size = int(np.prod(output_shape))

# Page-locked host buffers, sized in float32 elements.
# NOTE(review): assumes both engine I/O tensors are float32 - confirm.
h_input = cuda.pagelocked_empty(input_data.size, np.float32)
h_output = cuda.pagelocked_empty(output_size, np.float32)
np.copyto(h_input, input_data.ravel())

# Matching device buffers; execute_v2() receives their raw addresses in
# binding order (input first, then output).
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
bindings = [int(d_input), int(d_output)]
# Warmup: run a few untimed iterations before measuring.
for _ in range(10):
    cuda.memcpy_htod(d_input, h_input)
    # execute_v2() returns False on failure; do not silently continue.
    if not context.execute_v2(bindings=bindings):
        raise RuntimeError('TensorRT execute_v2 failed during batch=1 warmup')
    cuda.memcpy_dtoh(h_output, d_output)

# Benchmark: each sample covers host->device copy, inference and
# device->host copy, reported in milliseconds.
times = []
for _ in range(100):
    start = time.perf_counter()
    cuda.memcpy_htod(d_input, h_input)
    ok = context.execute_v2(bindings=bindings)
    cuda.memcpy_dtoh(h_output, d_output)
    elapsed_ms = (time.perf_counter() - start) * 1000
    # Checked after the timestamp so the check itself is not measured; a
    # failed run must not be recorded as a valid latency sample.
    if not ok:
        raise RuntimeError('TensorRT execute_v2 failed during batch=1 benchmark')
    times.append(elapsed_ms)

print('TensorRT 纯推理延迟 (batch=1):')
print(f' 平均: {np.mean(times):.2f}ms')
print(f' 中位数: {np.median(times):.2f}ms')
print(f' 最小: {np.min(times):.2f}ms')
print(f' 最大: {np.max(times):.2f}ms')
print(f' P95: {np.percentile(times, 95):.2f}ms')
print()
# Repeat the measurement with batch=4.
print("测试 batch=4...")
input_shape_4 = (4, 3, 480, 480)
input_data_4 = np.random.randn(*input_shape_4).astype(np.float32)

# set_input_shape() returns False when the engine cannot run this shape
# (e.g. an engine built for batch=1 only); stop instead of timing a
# configuration that was never applied.
if not context.set_input_shape('images', input_shape_4):
    raise RuntimeError(f'Engine rejected input shape {input_shape_4}')

# Derive the output shape from the context rather than hard-coding
# (4, 84, 4725): a model with a different class count or input resolution
# would otherwise get a wrongly sized output buffer.
# Assumes tensor index 1 is the single output, matching the bindings order
# used below.
output_name_4 = engine.get_tensor_name(1)
output_shape_4 = tuple(context.get_tensor_shape(output_name_4))
if any(dim < 0 for dim in output_shape_4):
    raise RuntimeError(
        f'Output shape of {output_name_4!r} is not fully resolved: '
        f'{output_shape_4}'
    )
output_size_4 = int(np.prod(output_shape_4))

# Page-locked host buffers, sized in float32 elements.
# NOTE(review): assumes both engine I/O tensors are float32 - confirm.
h_input_4 = cuda.pagelocked_empty(input_data_4.size, np.float32)
h_output_4 = cuda.pagelocked_empty(output_size_4, np.float32)
np.copyto(h_input_4, input_data_4.ravel())

# Matching device buffers; execute_v2() receives their raw addresses in
# binding order (input first, then output).
d_input_4 = cuda.mem_alloc(h_input_4.nbytes)
d_output_4 = cuda.mem_alloc(h_output_4.nbytes)
bindings_4 = [int(d_input_4), int(d_output_4)]
# Warmup: run a few untimed iterations before measuring.
for _ in range(10):
    cuda.memcpy_htod(d_input_4, h_input_4)
    # execute_v2() returns False on failure; do not silently continue.
    if not context.execute_v2(bindings=bindings_4):
        raise RuntimeError('TensorRT execute_v2 failed during batch=4 warmup')
    cuda.memcpy_dtoh(h_output_4, d_output_4)

# Benchmark: each sample covers host->device copy, inference and
# device->host copy for the whole batch, reported in milliseconds.
times_4 = []
for _ in range(100):
    start = time.perf_counter()
    cuda.memcpy_htod(d_input_4, h_input_4)
    ok = context.execute_v2(bindings=bindings_4)
    cuda.memcpy_dtoh(h_output_4, d_output_4)
    elapsed_ms = (time.perf_counter() - start) * 1000
    # Checked after the timestamp so the check itself is not measured; a
    # failed run must not be recorded as a valid latency sample.
    if not ok:
        raise RuntimeError('TensorRT execute_v2 failed during batch=4 benchmark')
    times_4.append(elapsed_ms)

print('TensorRT 纯推理延迟 (batch=4):')
print(f' 平均: {np.mean(times_4):.2f}ms')
print(f' 中位数: {np.median(times_4):.2f}ms')
print(f' 最小: {np.min(times_4):.2f}ms')
print(f' 最大: {np.max(times_4):.2f}ms')
print(f' P95: {np.percentile(times_4, 95):.2f}ms')