feat: TensorRT 固定 batch=4 重构
- tensorrt_engine.py 工业级 Buffer Pool - preprocessor.py 添加 pad_to_batch4() - postprocessor.py 支持批量输出 - settings.py 固定 batch_size=4
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -597,20 +597,21 @@ class PostProcessor:
|
||||
|
||||
output = outputs[0]
|
||||
|
||||
if len(output.shape) == 3:
|
||||
if output.ndim == 3:
|
||||
output = output[0]
|
||||
|
||||
num_detections = output.shape[0]
|
||||
if output.ndim == 2:
|
||||
output = output.reshape(-1)
|
||||
|
||||
num_detections = output.shape[0] // 85
|
||||
|
||||
boxes = []
|
||||
scores = []
|
||||
class_ids = []
|
||||
|
||||
for i in range(num_detections):
|
||||
detection = output[i]
|
||||
|
||||
if len(detection) < 6:
|
||||
continue
|
||||
start_idx = i * 85
|
||||
detection = output[start_idx:start_idx + 85]
|
||||
|
||||
x_center = detection[0]
|
||||
y_center = detection[1]
|
||||
@@ -637,16 +638,16 @@ class PostProcessor:
|
||||
y2 = y_center + height / 2
|
||||
|
||||
boxes.append([x1, y1, x2, y2])
|
||||
scores.append(total_conf)
|
||||
class_ids.append(class_id)
|
||||
scores.append(float(total_conf))
|
||||
class_ids.append(int(class_id))
|
||||
|
||||
if not boxes:
|
||||
return np.array([]), np.array([]), np.array([])
|
||||
|
||||
return (
|
||||
np.array(boxes),
|
||||
np.array(scores),
|
||||
np.array(class_ids)
|
||||
np.array(boxes, dtype=np.float32),
|
||||
np.array(scores, dtype=np.float32),
|
||||
np.array(class_ids, dtype=np.int32)
|
||||
)
|
||||
|
||||
def filter_by_roi(
|
||||
|
||||
@@ -227,13 +227,14 @@ class LetterboxPreprocessor:
|
||||
class BatchPreprocessor:
|
||||
"""Batch预处理器类
|
||||
|
||||
支持动态Batch大小,转换为NCHW格式,FP16精度
|
||||
固定 batch=4,支持 padding 到 batch=4
|
||||
"""
|
||||
|
||||
BATCH_SIZE = 4
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
target_size: Tuple[int, int] = (480, 480),
|
||||
max_batch_size: int = 8,
|
||||
fp16_mode: bool = True
|
||||
):
|
||||
"""
|
||||
@@ -241,44 +242,72 @@ class BatchPreprocessor:
|
||||
|
||||
Args:
|
||||
target_size: 目标尺寸 (width, height)
|
||||
max_batch_size: 最大Batch大小
|
||||
fp16_mode: 是否使用FP16精度
|
||||
"""
|
||||
self.target_size = target_size
|
||||
self.max_batch_size = max_batch_size
|
||||
self.fp16_mode = fp16_mode
|
||||
self.batch_size = self.BATCH_SIZE
|
||||
|
||||
self._letterbox = LetterboxPreprocessor(target_size)
|
||||
self._logger = get_logger("preprocessor")
|
||||
self._lock = threading.Lock()
|
||||
|
||||
self._memory_pool: List[np.ndarray] = []
|
||||
self._preallocated_size = max_batch_size
|
||||
self._logger.info(
|
||||
f"Batch预处理器: batch={self.batch_size}, "
|
||||
f"target_size={target_size}, fp16={fp16_mode}"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
|
||||
"""
|
||||
Padding 到 batch=4,重复最后一帧
|
||||
|
||||
Args:
|
||||
frames: list of [3, 480, 480] numpy arrays
|
||||
|
||||
Returns:
|
||||
np.ndarray: [4, 3, 480, 480]
|
||||
"""
|
||||
if len(frames) == 0:
|
||||
raise ValueError("Empty frames list")
|
||||
|
||||
if len(frames) == 4:
|
||||
return np.stack(frames)
|
||||
|
||||
pad_frame = frames[-1].copy()
|
||||
while len(frames) < 4:
|
||||
frames.append(pad_frame)
|
||||
|
||||
return np.stack(frames)
|
||||
|
||||
def preprocess_batch(
|
||||
self,
|
||||
images: List[np.ndarray]
|
||||
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
|
||||
"""
|
||||
预处理一个批次的图像
|
||||
预处理批次图像,自动 padding 到 batch=4
|
||||
|
||||
Args:
|
||||
images: 图像列表
|
||||
|
||||
Returns:
|
||||
tuple: (批次数据, 缩放信息列表)
|
||||
tuple: (批次数据 [4, 3, H, W], 缩放信息列表)
|
||||
"""
|
||||
batch_size = len(images)
|
||||
batch_size = min(batch_size, self.max_batch_size)
|
||||
batch_data, scale_info_list = self._preprocess_batch(images)
|
||||
|
||||
return batch_data, scale_info_list
|
||||
|
||||
def _preprocess_batch(
|
||||
self,
|
||||
images: List[np.ndarray]
|
||||
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
|
||||
"""内部预处理实现"""
|
||||
padded_images = self.pad_to_batch4(images)
|
||||
|
||||
scale_info_list = []
|
||||
processed_images = []
|
||||
|
||||
for i in range(batch_size):
|
||||
if i >= len(images):
|
||||
break
|
||||
|
||||
processed, scale_info = self._letterbox.preprocess(images[i])
|
||||
for i in range(self.batch_size):
|
||||
processed, scale_info = self._letterbox.preprocess(padded_images[i])
|
||||
processed_images.append(processed)
|
||||
scale_info_list.append(scale_info)
|
||||
|
||||
@@ -298,53 +327,6 @@ class BatchPreprocessor:
|
||||
stacked = stacked.astype(np.float16)
|
||||
|
||||
return stacked
|
||||
|
||||
def allocate_batch_memory(self, batch_size: int) -> np.ndarray:
    """Return a batch buffer from the pool, allocating a new one if needed.

    Args:
        batch_size: Requested batch size; clamped to ``self.max_batch_size``.

    Returns:
        A numpy array of shape (batch_size, 3, height, width), float16 when
        ``self.fp16_mode`` is set and float32 otherwise. A pooled buffer
        whose leading dimension matches is returned as-is (it is not
        re-zeroed); a newly created buffer is zero-filled and added to the
        pool.
    """
    batch_size = min(batch_size, self.max_batch_size)

    # target_size is stored as (width, height), while the buffer layout puts
    # height before width, so unpack in that order.
    width, height = self.target_size
    dtype = np.float16 if self.fp16_mode else np.float32

    with self._lock:
        for mem in self._memory_pool:
            if mem.shape[0] == batch_size:
                return mem

        mem = np.zeros((batch_size, 3, height, width), dtype=dtype)
        self._memory_pool.append(mem)
        return mem
|
||||
|
||||
def release_memory(self):
    """Empty the preprocessing memory pool and log that it was released."""
    pool_guard = self._lock
    with pool_guard:
        # Empty the list in place rather than rebinding the attribute.
        del self._memory_pool[:]
    self._logger.info("预处理内存池已释放")
|
||||
|
||||
def get_memory_usage(self) -> Dict[str, int]:
    """Summarise the memory currently held by the pool.

    Returns:
        A dict with ``total_bytes`` (sum of ``nbytes`` over pooled buffers),
        ``total_mb`` (the same amount divided by 1024 ** 2) and
        ``block_count`` (number of pooled buffers).
    """
    with self._lock:
        # Take the sizes while holding the lock; the arithmetic can happen after.
        block_sizes = [block.nbytes for block in self._memory_pool]

    byte_total = sum(block_sizes)
    usage = {
        "total_bytes": byte_total,
        "total_mb": byte_total / (1024 ** 2),
        "block_count": len(block_sizes),
    }
    return usage
|
||||
|
||||
|
||||
class ImagePreprocessor:
|
||||
@@ -372,7 +354,6 @@ class ImagePreprocessor:
|
||||
)
|
||||
self._batch_preprocessor = BatchPreprocessor(
|
||||
target_size=(config.input_width, config.input_height),
|
||||
max_batch_size=config.max_batch_size,
|
||||
fp16_mode=config.fp16_mode
|
||||
)
|
||||
|
||||
@@ -380,7 +361,7 @@ class ImagePreprocessor:
|
||||
self._logger.info(
|
||||
f"图像预处理器初始化完成: "
|
||||
f"输入尺寸 {config.input_width}x{config.input_height}, "
|
||||
f"Batch大小 {config.batch_size}-{config.max_batch_size}, "
|
||||
f"Batch大小 {self._batch_preprocessor.batch_size}, "
|
||||
f"FP16模式 {config.fp16_mode}"
|
||||
)
|
||||
|
||||
@@ -416,15 +397,17 @@ class ImagePreprocessor:
|
||||
rois: Optional[List[Optional[ROIInfo]]] = None
|
||||
) -> Tuple[np.ndarray, List[Tuple[float, float, float, float]]]:
|
||||
"""
|
||||
预处理批次图像
|
||||
预处理批次图像,自动 padding 到 batch=4
|
||||
|
||||
Args:
|
||||
images: 原始图像列表
|
||||
rois: 可选的ROI配置列表
|
||||
|
||||
Returns:
|
||||
tuple: (批次数据, 缩放信息列表)
|
||||
tuple: (批次数据 [4, 3, H, W], 缩放信息列表)
|
||||
"""
|
||||
from core.tensorrt_engine import pad_to_batch4
|
||||
|
||||
if rois is None:
|
||||
rois = [None] * len(images)
|
||||
|
||||
@@ -436,7 +419,7 @@ class ImagePreprocessor:
|
||||
processed_images.append(processed)
|
||||
scale_info_list.append(scale_info)
|
||||
|
||||
batch_data = self._batch_preprocessor._stack_and_normalize(processed_images)
|
||||
batch_data = self._batch_preprocessor.preprocess_batch(processed_images)
|
||||
|
||||
return batch_data, scale_info_list
|
||||
|
||||
@@ -463,13 +446,11 @@ class ImagePreprocessor:
|
||||
"config": {
|
||||
"input_width": self.config.input_width,
|
||||
"input_height": self.config.input_height,
|
||||
"batch_size": self.config.batch_size,
|
||||
"max_batch_size": self.config.max_batch_size,
|
||||
"batch_size": self._batch_preprocessor.batch_size,
|
||||
"fp16_mode": self.config.fp16_mode,
|
||||
},
|
||||
"memory": self._batch_preprocessor.get_memory_usage(),
|
||||
}
|
||||
|
||||
def release_resources(self):
|
||||
"""释放资源"""
|
||||
self._batch_preprocessor.release_memory()
|
||||
self._logger.info("预处理器资源已释放")
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
"""
|
||||
TensorRT推理引擎模块
|
||||
固定 batch=4, FP16, 3×480×480
|
||||
工业级实现:Buffer Pool、异步推理、性能监控
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
@@ -38,8 +38,31 @@ class HostDeviceMem:
|
||||
return f"Host:{self.host.shape}, Device:{int(self.device)}"
|
||||
|
||||
|
||||
def pad_to_batch4(frames: List[np.ndarray]) -> np.ndarray:
    """Stack ``frames`` into one array, padding up to 4 entries with the last frame.

    The caller's list is left untouched; padding is applied to a private copy.

    Args:
        frames: Non-empty list of equally-shaped arrays (documented by the
            original author as [3, 480, 480] each).

    Returns:
        np.ndarray: The stacked frames. The leading dimension is 4 whenever
        1 to 4 frames are supplied. Lists longer than 4 are stacked as-is
        (no truncation), matching the previous behaviour.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError("Empty frames list")

    # A non-positive repeat count yields an empty list, so inputs that already
    # hold 4 or more frames pass through without padding.
    missing = 4 - len(frames)
    padded = list(frames) + [frames[-1]] * missing

    # np.stack copies the data, so repeating the same frame object is safe.
    return np.stack(padded)
|
||||
|
||||
|
||||
class TensorRTEngine:
|
||||
"""工业级 TensorRT 引擎
|
||||
"""固定 batch TensorRT 引擎 (batch=4, FP16, 3×480×480)
|
||||
|
||||
特性:
|
||||
- Buffer Pool: bindings 只在 init 阶段分配一次
|
||||
@@ -47,6 +70,9 @@ class TensorRTEngine:
|
||||
- Async API: CUDA stream + async memcpy + execute_async_v2
|
||||
"""
|
||||
|
||||
BATCH_SIZE = 4
|
||||
INPUT_SHAPE = (3, 480, 480)
|
||||
|
||||
def __init__(self, config: Optional[InferenceConfig] = None):
|
||||
if not TRT_AVAILABLE:
|
||||
raise RuntimeError("TensorRT 未安装,请先安装 tensorrt 库")
|
||||
@@ -68,7 +94,6 @@ class TensorRTEngine:
|
||||
self._bindings: List[int] = []
|
||||
self._inputs: List[HostDeviceMem] = []
|
||||
self._outputs: List[HostDeviceMem] = []
|
||||
self._binding_names: Dict[int, str] = {}
|
||||
|
||||
self._performance_stats = {
|
||||
"inference_count": 0,
|
||||
@@ -81,8 +106,8 @@ class TensorRTEngine:
|
||||
self._logger.info(
|
||||
f"TensorRT 引擎初始化: "
|
||||
f"{config.model_path}, "
|
||||
f"{config.input_width}x{config.input_height}, "
|
||||
f"batch={config.batch_size}, "
|
||||
f"batch={self.BATCH_SIZE}, "
|
||||
f"shape={self.INPUT_SHAPE}, "
|
||||
f"fp16={config.fp16_mode}"
|
||||
)
|
||||
|
||||
@@ -113,7 +138,7 @@ class TensorRTEngine:
|
||||
"load", "TensorRT", engine_path, True
|
||||
)
|
||||
self._logger.info(f"TensorRT 引擎加载成功: {engine_path}")
|
||||
self._logger.info(f" 输入: {len(self._inputs)}, 输出: {len(self._outputs)}")
|
||||
self._logger.info(f" 输入: {len(self._inputs)}, 输出: {len(self._outputs)}, batch={self.BATCH_SIZE}")
|
||||
|
||||
return True
|
||||
|
||||
@@ -122,30 +147,31 @@ class TensorRTEngine:
|
||||
return False
|
||||
|
||||
def _allocate_buffers(self):
|
||||
"""Buffer Pool: 初始化阶段一次性分配所有 bindings(工业级关键点)"""
|
||||
"""Buffer Pool: 初始化阶段一次性分配所有 bindings
|
||||
|
||||
对于动态 shape engine,使用配置中的 batch_size 作为默认大小
|
||||
"""
|
||||
self._bindings = []
|
||||
self._inputs = []
|
||||
self._outputs = []
|
||||
self._binding_names = {}
|
||||
|
||||
for binding_idx in range(self._engine.num_bindings):
|
||||
name = self._engine.get_binding_name(binding_idx)
|
||||
shape = list(self._engine.get_binding_shape(binding_idx))
|
||||
dtype = trt.nptype(self._engine.get_binding_dtype(binding_idx))
|
||||
shape = self._engine.get_binding_shape(binding_idx)
|
||||
|
||||
self._binding_names[binding_idx] = name
|
||||
if shape[0] == -1:
|
||||
shape[0] = self.BATCH_SIZE
|
||||
|
||||
shape = tuple(max(1, s) if s < 0 else s for s in shape)
|
||||
size = trt.volume(shape)
|
||||
|
||||
try:
|
||||
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||
device_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
except Exception as e:
|
||||
self._logger.warning(f"pagelocked memory 分配失败,回退到普通 numpy: {e}")
|
||||
host_mem = np.zeros(size, dtype=dtype)
|
||||
device_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
|
||||
device_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||
self._bindings.append(int(device_mem))
|
||||
|
||||
mem_pair = HostDeviceMem(host_mem, device_mem)
|
||||
@@ -159,24 +185,13 @@ class TensorRTEngine:
|
||||
raise RuntimeError("No input bindings found")
|
||||
if len(self._outputs) == 0:
|
||||
raise RuntimeError("No output bindings found")
|
||||
|
||||
self._logger.debug(
|
||||
f"Buffer Pool 分配完成: "
|
||||
f"inputs={[int(i.device) for i in self._inputs]}, "
|
||||
f"outputs={[int(o.device) for o in self._outputs]}"
|
||||
)
|
||||
|
||||
def _get_output_shape(self, binding_idx: int) -> Tuple[int, ...]:
    """Look up the engine-reported shape for the binding at ``binding_idx``.

    The index is first translated to its binding name via
    ``self._binding_names``; the engine is then queried by that name.
    """
    return self._engine.get_binding_shape(self._binding_names[binding_idx])
|
||||
|
||||
def infer(self, input_np: np.ndarray) -> Tuple[List[np.ndarray], float]:
|
||||
def infer(self, input_batch: np.ndarray) -> Tuple[List[np.ndarray], float]:
|
||||
"""
|
||||
执行推理(工业级 async 模式)
|
||||
|
||||
Args:
|
||||
input_np: numpy 输入,shape 必须与 engine 一致
|
||||
input_batch: numpy 输入,shape = [batch, 3, 480, 480],dtype = np.float16
|
||||
|
||||
Returns:
|
||||
tuple: (输出列表, 推理耗时ms)
|
||||
@@ -187,17 +202,20 @@ class TensorRTEngine:
|
||||
if len(self._inputs) == 0:
|
||||
raise RuntimeError("未分配输入 buffer")
|
||||
|
||||
batch_size = input_batch.shape[0]
|
||||
|
||||
start_time = time.perf_counter()
|
||||
|
||||
self._cuda_context.push()
|
||||
|
||||
try:
|
||||
input_np = np.ascontiguousarray(input_np)
|
||||
input_batch = np.ascontiguousarray(input_batch)
|
||||
|
||||
input_name = self._binding_names[0]
|
||||
self._context.set_input_shape(input_name, input_np.shape)
|
||||
input_name = self._engine.get_binding_name(0)
|
||||
actual_shape = list(input_batch.shape)
|
||||
self._context.set_input_shape(input_name, actual_shape)
|
||||
|
||||
np.copyto(self._inputs[0].host, input_np.ravel())
|
||||
np.copyto(self._inputs[0].host, input_batch.ravel())
|
||||
|
||||
cuda.memcpy_htod_async(
|
||||
self._inputs[0].device,
|
||||
@@ -210,28 +228,20 @@ class TensorRTEngine:
|
||||
stream_handle=self._stream.handle
|
||||
)
|
||||
|
||||
results = []
|
||||
for out in self._outputs:
|
||||
cuda.memcpy_dtoh_async(
|
||||
out.host,
|
||||
out.device,
|
||||
self._stream
|
||||
)
|
||||
results.append(out.host.copy())
|
||||
|
||||
self._stream.synchronize()
|
||||
|
||||
inference_time_ms = (time.perf_counter() - start_time) * 1000
|
||||
|
||||
batch_size = input_np.shape[0]
|
||||
self._update_performance_stats(inference_time_ms, batch_size)
|
||||
|
||||
output_shapes = []
|
||||
for i in range(len(self._inputs), self._engine.num_bindings):
|
||||
output_shapes.append(self._get_output_shape(i))
|
||||
|
||||
results = []
|
||||
for idx, out in enumerate(self._outputs):
|
||||
shape = output_shapes[idx] if idx < len(output_shapes) else out.host.shape
|
||||
results.append(out.host.reshape(shape))
|
||||
self._update_performance_stats(inference_time_ms, self.BATCH_SIZE)
|
||||
|
||||
return results, inference_time_ms
|
||||
|
||||
|
||||
Reference in New Issue
Block a user