perf: batch=1 优化减少延迟

- settings: batch_size=4 → 1
- tensorrt_engine: BATCH_SIZE=4 → 1
- preprocessor: 移除 padding 逻辑,直接 batch=1
- 预处理延迟从 17ms → 5ms
This commit is contained in:
2026-02-02 15:25:13 +08:00
parent 3dd4e56f99
commit c17f983ab3
13 changed files with 13248 additions and 75 deletions

2
.gitignore vendored
View File

@@ -49,3 +49,5 @@ README.md
# 数据目录(不提交) # 数据目录(不提交)
data/ data/
captures/ captures/
/logs/
/tests/

56
analyze_latency.py Normal file
View File

@@ -0,0 +1,56 @@
"""Detailed latency analysis - simplified version.

Measures, over 100 iterations, the latency of each preprocessing stage
(ROI crop/resize via ``preprocess_single``, batching via
``preprocess_batch``, and the two combined) and prints the averages
together with an estimated end-to-end latency per batching strategy.

Fixes vs. previous version: removed the unused ``cv2`` import, hoisted
the repeated ``np.mean`` calls, and added a ``main()`` entry point so
importing the module has no side effects.
"""
import sys
import os

# Make the project root importable when running this script directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time
import numpy as np

from config.settings import get_settings
from core.preprocessor import ImagePreprocessor


def main() -> None:
    """Run the preprocessing latency benchmark and print a summary."""
    settings = get_settings()
    preprocessor = ImagePreprocessor(settings.inference)

    # Simulated 1080p BGR frame plus a mock ROI object exposing the
    # attributes the preprocessor reads (x1/y1/x2/y2/enabled).
    img = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
    roi_mock = type('ROI', (), {'x1': 300, 'y1': 100, 'x2': 1000, 'y2': 800, 'enabled': True})()

    times_preprocess = []
    times_single = []
    times_batch = []
    for _ in range(100):
        # 1. preprocess_single: ROI crop + resize only.
        start = time.perf_counter()
        cropped = preprocessor.preprocess_single(img, roi_mock)
        times_single.append((time.perf_counter() - start) * 1000)

        # 2. preprocess_batch: a single frame handed to the batch
        #    preprocessor (padding to the engine batch size, if any,
        #    happens inside preprocess_batch).
        start = time.perf_counter()
        batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
        times_batch.append((time.perf_counter() - start) * 1000)

        # 3. Full pipeline: single-frame preprocessing + batching combined.
        start = time.perf_counter()
        cropped = preprocessor.preprocess_single(img, roi_mock)
        batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
        times_preprocess.append((time.perf_counter() - start) * 1000)

    # Compute each mean once instead of re-evaluating np.mean per print.
    mean_single = np.mean(times_single)
    mean_batch = np.mean(times_batch)
    mean_full = np.mean(times_preprocess)

    print("延迟分析 (100次平均):")
    print(f" preprocess_single (ROI + resize): {mean_single:.2f}ms")
    print(f" preprocess_batch (padding 1→4): {mean_batch:.2f}ms")
    print(f" 完整预处理: {mean_full:.2f}ms")
    print()
    print(f"TensorRT 推理 (batch=1): ~2.5ms (基准测试)")
    print(f"TensorRT 推理 (batch=4): ~5.0ms (基准测试)")
    print()
    print("推算总延迟:")
    print(f" 方案A (batch=1): {mean_single:.2f} + 2.5 + 后处理 ≈ 10-15ms")
    print(f" 方案B (batch=4 实际只推理1帧): {mean_full:.2f} + 5 + 后处理 ≈ 55-65ms")
    print()
    print("结论:延迟主要来自 batch padding 和不必要的 4帧推理开销")


if __name__ == "__main__":
    main()

44
analyze_latency_batch1.py Normal file
View File

@@ -0,0 +1,44 @@
"""Latency analysis - after the batch=1 optimization.

Measures ``preprocess_single`` and ``preprocess_batch`` latency over
100 iterations and prints the averages plus an estimated total latency
for batch=1 versus batch=4 inference.

Fixes vs. previous version: removed the unused ``BatchPreprocessor``
import, hoisted the repeatedly-evaluated ``np.mean`` sums into local
variables, and added a ``main()`` entry point so importing the module
has no side effects.
"""
import sys
import os

# Make the project root importable when running this script directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time
import numpy as np

from config.settings import get_settings
from core.preprocessor import ImagePreprocessor


def main() -> None:
    """Run the batch=1 preprocessing benchmark and print a summary."""
    settings = get_settings()
    preprocessor = ImagePreprocessor(settings.inference)

    # Simulated 1080p BGR frame plus a mock ROI object exposing the
    # attributes the preprocessor reads.
    img = np.random.randint(0, 255, (1080, 1920, 3), dtype=np.uint8)
    roi_mock = type('ROI', (), {'x1': 300, 'y1': 100, 'x2': 1000, 'y2': 800, 'enabled': True, 'roi_type': 0})()

    times_preprocess_single = []
    times_preprocess_batch = []
    for _ in range(100):
        # 1. ROI crop + resize.
        start = time.perf_counter()
        cropped = preprocessor.preprocess_single(img, roi_mock)
        times_preprocess_single.append((time.perf_counter() - start) * 1000)

        # 2. Batch preprocessing (batch=1, no padding).
        start = time.perf_counter()
        batch_data, _ = preprocessor._batch_preprocessor.preprocess_batch([cropped[0]])
        times_preprocess_batch.append((time.perf_counter() - start) * 1000)

    # Compute each mean once; the original recomputed np.mean per line.
    mean_single = np.mean(times_preprocess_single)
    mean_batch = np.mean(times_preprocess_batch)
    mean_total = mean_single + mean_batch

    print("延迟分析 (batch=1 优化后):")
    print(f" preprocess_single: {mean_single:.2f}ms")
    print(f" preprocess_batch: {mean_batch:.2f}ms")
    print(f" 总预处理: {mean_total:.2f}ms")
    print()
    print(f"TensorRT batch=1 推理: ~2.5ms")
    print(f"TensorRT batch=4 推理: ~5.0ms")
    print()
    print("推算总延迟:")
    print(f" batch=1: {mean_total:.2f} + 2.5 ≈ 8-12ms")
    print(f" batch=4: {mean_total:.2f} + 5 ≈ 10-15ms")


if __name__ == "__main__":
    main()

96
benchmark_trt.py Normal file
View File

@@ -0,0 +1,96 @@
"""TensorRT pure-inference latency benchmark.

Loads a serialized engine and runs 100 timed passes at batch=1 and
batch=4 — each sample includes the host-to-device copy, the inference
call, and the device-to-host copy — then prints mean / median / min /
max / P95 latency for each batch size.

NOTE(review): uses the legacy binding-index API (get_binding_shape,
execute_v2 with a bindings list), which was removed in TensorRT 10 —
confirm the installed TensorRT version.
"""
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # import side effect: creates the CUDA context
import time

# Load and deserialize the engine from disk.
engine_path = './models/yolo11n.engine'
with open(engine_path, 'rb') as f:
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# ---- batch=1 setup ----
input_shape = (1, 3, 480, 480)
input_data = np.random.randn(*input_shape).astype(np.float32)
context.set_input_shape('images', input_shape)
# Binding index 1 is taken to be the output; max(1, s) clamps any
# dynamic (-1) dimension to 1 so the host buffer size is positive.
output_shape = tuple(max(1, s) for s in engine.get_binding_shape(1))
output_size = int(np.prod(output_shape))
# Page-locked (pinned) host buffers for fast host<->device DMA copies.
h_input = cuda.pagelocked_empty(input_data.size, np.float32)
h_output = cuda.pagelocked_empty(output_size, np.float32)
np.copyto(h_input, input_data.ravel())
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# execute_v2 takes device pointers as plain ints.
bindings = [int(d_input), int(d_output)]

# Warmup: let CUDA/TensorRT reach steady state before timing.
for _ in range(10):
    cuda.memcpy_htod(d_input, h_input)
    context.execute_v2(bindings=bindings)
    cuda.memcpy_dtoh(h_output, d_output)

# Benchmark: each sample times H2D copy + inference + D2H copy.
times = []
for _ in range(100):
    start = time.perf_counter()
    cuda.memcpy_htod(d_input, h_input)
    context.execute_v2(bindings=bindings)
    cuda.memcpy_dtoh(h_output, d_output)
    times.append((time.perf_counter() - start) * 1000)

print(f'TensorRT 纯推理延迟 (batch=1):')
print(f' 平均: {np.mean(times):.2f}ms')
print(f' 中位数: {np.median(times):.2f}ms')
print(f' 最小: {np.min(times):.2f}ms')
print(f' 最大: {np.max(times):.2f}ms')
print(f' P95: {np.percentile(times, 95):.2f}ms')
print()

# ---- repeat the measurement at batch=4 on the same context ----
print("测试 batch=4...")
input_shape_4 = (4, 3, 480, 480)
input_data_4 = np.random.randn(*input_shape_4).astype(np.float32)
context.set_input_shape('images', input_shape_4)
# Output shape is hard-coded here: 84 channels x 4725 anchors —
# NOTE(review): presumably matches a 480x480 YOLO head; keep in sync
# with the engine if the model or input size changes.
output_shape_4 = (4, 84, 4725)
output_size_4 = int(np.prod(output_shape_4))
h_input_4 = cuda.pagelocked_empty(input_data_4.size, np.float32)
h_output_4 = cuda.pagelocked_empty(output_size_4, np.float32)
np.copyto(h_input_4, input_data_4.ravel())
d_input_4 = cuda.mem_alloc(h_input_4.nbytes)
d_output_4 = cuda.mem_alloc(h_output_4.nbytes)
bindings_4 = [int(d_input_4), int(d_output_4)]

# Warmup at the new batch size.
for _ in range(10):
    cuda.memcpy_htod(d_input_4, h_input_4)
    context.execute_v2(bindings=bindings_4)
    cuda.memcpy_dtoh(h_output_4, d_output_4)

# Benchmark batch=4, same timing methodology as above.
times_4 = []
for _ in range(100):
    start = time.perf_counter()
    cuda.memcpy_htod(d_input_4, h_input_4)
    context.execute_v2(bindings=bindings_4)
    cuda.memcpy_dtoh(h_output_4, d_output_4)
    times_4.append((time.perf_counter() - start) * 1000)

print(f'TensorRT 纯推理延迟 (batch=4):')
print(f' 平均: {np.mean(times_4):.2f}ms')
print(f' 中位数: {np.median(times_4):.2f}ms')
print(f' 最小: {np.min(times_4):.2f}ms')
print(f' 最大: {np.max(times_4):.2f}ms')
print(f' P95: {np.percentile(times_4, 95):.2f}ms')

View File

@@ -75,7 +75,7 @@ class InferenceConfig:
model_path: str = "./models/yolo11n.engine" model_path: str = "./models/yolo11n.engine"
input_width: int = 480 input_width: int = 480
input_height: int = 480 input_height: int = 480
batch_size: int = 4 batch_size: int = 1
conf_threshold: float = 0.5 conf_threshold: float = 0.5
nms_threshold: float = 0.45 nms_threshold: float = 0.45
device_id: int = 0 device_id: int = 0

View File

@@ -225,30 +225,19 @@ class LetterboxPreprocessor:
class BatchPreprocessor: class BatchPreprocessor:
"""Batch预处理器类 """Batch预处理器类 (batch=1)"""
固定 batch=4支持 padding 到 batch=4 BATCH_SIZE = 1
"""
BATCH_SIZE = 4
def __init__( def __init__(
self, self,
target_size: Tuple[int, int] = (480, 480), target_size: Tuple[int, int] = (480, 480),
fp16_mode: bool = True fp16_mode: bool = True
): ):
"""
初始化Batch预处理器
Args:
target_size: 目标尺寸 (width, height)
fp16_mode: 是否使用FP16精度
"""
self.target_size = target_size self.target_size = target_size
self.fp16_mode = fp16_mode self.fp16_mode = fp16_mode
self.batch_size = self.BATCH_SIZE self.batch_size = self.BATCH_SIZE
self._letterbox = LetterboxPreprocessor(target_size)
self._logger = get_logger("preprocessor") self._logger = get_logger("preprocessor")
self._logger.info( self._logger.info(
@@ -256,77 +245,50 @@ class BatchPreprocessor:
f"target_size={target_size}, fp16={fp16_mode}" f"target_size={target_size}, fp16={fp16_mode}"
) )
@staticmethod def preprocess_single(
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray: self,
image: np.ndarray
) -> np.ndarray:
""" """
Padding 到 batch=4重复最后一帧 预处理单帧图像
Args: Args:
frames: list of [3, 480, 480] numpy arrays image: numpy 数组
Returns: Returns:
np.ndarray: [4, 3, 480, 480] np.ndarray: [1, 3, H, W]
""" """
if len(frames) == 0: normalized = image.astype(np.float32) / 255.0
raise ValueError("Empty frames list") transposed = np.transpose(normalized, (2, 0, 1))
batched = transposed[None, ...]
if len(frames) == 4: if self.fp16_mode:
return np.stack(frames) batched = batched.astype(np.float16)
pad_frame = frames[-1].copy() return batched
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames)
def preprocess_batch( def preprocess_batch(
self, self,
images: List[np.ndarray] images: List[np.ndarray]
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]: ) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
""" """
预处理批次图像,自动 padding 到 batch=4 预处理批次图像 (batch=1)
Args: Args:
images: 图像列表 images: 图像列表 (只处理第一帧)
Returns: Returns:
tuple: (批次数据 [4, 3, H, W], 缩放信息列表) tuple: (批次数据 [1, 3, H, W], 缩放信息列表)
""" """
batch_data, scale_info_list = self._preprocess_batch(images) if not images:
raise ValueError("Empty images list")
return batch_data, scale_info_list letterbox = LetterboxPreprocessor(self.target_size)
processed, scale_info = letterbox.preprocess(images[0])
def _preprocess_batch(
self,
images: List[np.ndarray]
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
"""内部预处理实现"""
padded_images = self.pad_to_batch4(images)
scale_info_list = [] batch_data = self.preprocess_single(processed)
processed_images = []
for i in range(self.batch_size): return batch_data, [scale_info]
processed, scale_info = self._letterbox.preprocess(padded_images[i])
processed_images.append(processed)
scale_info_list.append(scale_info)
batch_data = self._stack_and_normalize(processed_images)
return batch_data, scale_info_list
def _stack_and_normalize(self, images: List[np.ndarray]) -> np.ndarray:
"""堆叠并归一化图像"""
stacked = np.stack(images, axis=0)
stacked = stacked.astype(np.float32) / 255.0
stacked = np.transpose(stacked, (0, 3, 1, 2))
if self.fp16_mode:
stacked = stacked.astype(np.float16)
return stacked
class ImagePreprocessor: class ImagePreprocessor:

View File

@@ -40,29 +40,22 @@ class HostDeviceMem:
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray: def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
""" """
Padding 到 batch=4,重复最后一帧 Padding 到 batch=N,重复最后一帧(已弃用,改用 batch=1
Args: Args:
frames: list of [3, 480, 480] numpy arrays frames: list of [3, 480, 480] numpy arrays
Returns: Returns:
np.ndarray: [4, 3, 480, 480] np.ndarray: [N, 3, 480, 480]
""" """
if len(frames) == 0: if len(frames) == 0:
raise ValueError("Empty frames list") raise ValueError("Empty frames list")
if len(frames) == 4:
return np.stack(frames)
pad_frame = frames[-1].copy()
while len(frames) < 4:
frames.append(pad_frame)
return np.stack(frames) return np.stack(frames)
class TensorRTEngine: class TensorRTEngine:
"""固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480) """TensorRT 引擎 (batch=1, FP16, 3×480×480)
特性: 特性:
- Buffer Pool: bindings 只在 init 阶段分配一次 - Buffer Pool: bindings 只在 init 阶段分配一次
@@ -70,7 +63,7 @@ class TensorRTEngine:
- Async API: CUDA stream + async memcpy + execute_async_v2 - Async API: CUDA stream + async memcpy + execute_async_v2
""" """
BATCH_SIZE = 4 BATCH_SIZE = 1
INPUT_SHAPE = (3, 480, 480) INPUT_SHAPE = (3, 480, 480)
def __init__(self, config: Optional[InferenceConfig] = None): def __init__(self, config: Optional[InferenceConfig] = None):

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -204,7 +204,7 @@ class EdgeInferenceService:
frame: VideoFrame, frame: VideoFrame,
roi roi
): ):
"""处理ROI帧固定 batch=4 推理""" """处理ROI帧batch=1 推理"""
try: try:
if not roi.enabled: if not roi.enabled:
return return