"""TensorRT 纯推理延迟测试""" import numpy as np import tensorrt as trt import pycuda.driver as cuda import pycuda.autoinit import time engine_path = './models/yolo11n.engine' with open(engine_path, 'rb') as f: runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) engine = runtime.deserialize_cuda_engine(f.read()) context = engine.create_execution_context() input_shape = (1, 3, 480, 480) input_data = np.random.randn(*input_shape).astype(np.float32) context.set_input_shape('images', input_shape) output_shape = tuple(max(1, s) for s in engine.get_binding_shape(1)) output_size = int(np.prod(output_shape)) h_input = cuda.pagelocked_empty(input_data.size, np.float32) h_output = cuda.pagelocked_empty(output_size, np.float32) np.copyto(h_input, input_data.ravel()) d_input = cuda.mem_alloc(h_input.nbytes) d_output = cuda.mem_alloc(h_output.nbytes) bindings = [int(d_input), int(d_output)] # Warmup for _ in range(10): cuda.memcpy_htod(d_input, h_input) context.execute_v2(bindings=bindings) cuda.memcpy_dtoh(h_output, d_output) # Benchmark times = [] for _ in range(100): start = time.perf_counter() cuda.memcpy_htod(d_input, h_input) context.execute_v2(bindings=bindings) cuda.memcpy_dtoh(h_output, d_output) times.append((time.perf_counter() - start) * 1000) print(f'TensorRT 纯推理延迟 (batch=1):') print(f' 平均: {np.mean(times):.2f}ms') print(f' 中位数: {np.median(times):.2f}ms') print(f' 最小: {np.min(times):.2f}ms') print(f' 最大: {np.max(times):.2f}ms') print(f' P95: {np.percentile(times, 95):.2f}ms') print() # 再测试 batch=4 print("测试 batch=4...") input_shape_4 = (4, 3, 480, 480) input_data_4 = np.random.randn(*input_shape_4).astype(np.float32) context.set_input_shape('images', input_shape_4) output_shape_4 = (4, 84, 4725) output_size_4 = int(np.prod(output_shape_4)) h_input_4 = cuda.pagelocked_empty(input_data_4.size, np.float32) h_output_4 = cuda.pagelocked_empty(output_size_4, np.float32) np.copyto(h_input_4, input_data_4.ravel()) d_input_4 = cuda.mem_alloc(h_input_4.nbytes) d_output_4 = cuda.mem_alloc(h_output_4.nbytes) bindings_4 = [int(d_input_4), int(d_output_4)] # Warmup for _ in range(10): cuda.memcpy_htod(d_input_4, h_input_4) context.execute_v2(bindings=bindings_4) cuda.memcpy_dtoh(h_output_4, d_output_4) # Benchmark times_4 = [] for _ in range(100): start = time.perf_counter() cuda.memcpy_htod(d_input_4, h_input_4) context.execute_v2(bindings=bindings_4) cuda.memcpy_dtoh(h_output_4, d_output_4) times_4.append((time.perf_counter() - start) * 1000) print(f'TensorRT 纯推理延迟 (batch=4):') print(f' 平均: {np.mean(times_4):.2f}ms') print(f' 中位数: {np.median(times_4):.2f}ms') print(f' 最小: {np.min(times_4):.2f}ms') print(f' 最大: {np.max(times_4):.2f}ms') print(f' P95: {np.percentile(times_4, 95):.2f}ms')