fix: 修复 TensorRT bindings 问题(execute_v2 的 bindings 改为传入整数形式的内存地址,而非 numpy 数组)

- tensorrt_engine.py 添加 pycuda 支持
- 添加 CUDA 上下文和流管理(加载引擎时创建上下文与流,推理前 push、结束后 pop 上下文)
- _is_in_working_hours 支持字符串格式
This commit is contained in:
2026-02-02 14:00:21 +08:00
parent 29d3ea0bc4
commit 0a1d61c1e2
9 changed files with 70748 additions and 21 deletions

View File

@@ -3,6 +3,7 @@ TensorRT推理引擎模块
实现引擎加载、显存优化、异步推理、性能监控
"""
import ctypes
import logging
import threading
import time
@@ -12,10 +13,13 @@ import numpy as np
try:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
TRT_AVAILABLE = True
except ImportError:
TRT_AVAILABLE = False
trt = None
cuda = None
from config.settings import get_settings, InferenceConfig
from utils.logger import get_logger
@@ -50,6 +54,7 @@ class TensorRTEngine:
self._output_bindings = []
self._stream = None
self._released = False
self._cuda_context = None
self._logger = get_logger("tensorrt")
self._lock = threading.Lock()
@@ -90,6 +95,10 @@ class TensorRTEngine:
if self._context is not None:
self._release_resources()
if cuda is not None:
self._cuda_context = cuda.Device(0).make_context()
self._stream = cuda.Stream()
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
with open(engine_path, "rb") as f:
@@ -241,24 +250,60 @@ class TensorRTEngine:
input_data.shape
)
input_tensor = input_data
output_tensors = []
for output in self._output_bindings:
output_shape = list(output["shape"])
output_shape[0] = batch_size
output_tensor = np.zeros(output_shape, dtype=self._get_numpy_dtype(output["dtype"]))
output_tensors.append(output_tensor)
bindings = [input_tensor] + output_tensors
self._context.execute_v2(bindings=bindings)
inference_time_ms = (time.perf_counter() - start_time) * 1000
self._update_performance_stats(inference_time_ms, batch_size)
return output_tensors, inference_time_ms
if cuda is not None and self._cuda_context is not None:
self._cuda_context.push()
try:
input_data = np.ascontiguousarray(input_data)
input_ptr = cuda.mem_alloc(input_data.nbytes)
cuda.memcpy_htod(input_ptr, input_data)
bindings = [int(input_ptr)]
output_tensors = []
for output in self._output_bindings:
output_shape = list(output["shape"])
output_shape[0] = batch_size
output_tensor = np.zeros(output_shape, dtype=self._get_numpy_dtype(output["dtype"]))
output_tensor = np.ascontiguousarray(output_tensor)
output_ptr = cuda.mem_alloc(output_tensor.nbytes)
cuda.memcpy_htod(output_ptr, output_tensor)
bindings.append(int(output_ptr))
output_tensors.append((output_tensor, output_ptr))
self._context.execute_v2(bindings=bindings)
for output_tensor, output_ptr in output_tensors:
cuda.memcpy_dtoh(output_tensor, output_ptr)
inference_time_ms = (time.perf_counter() - start_time) * 1000
self._update_performance_stats(inference_time_ms, batch_size)
return [t[0] for t in output_tensors], inference_time_ms
finally:
self._cuda_context.pop()
else:
input_tensor = input_data
output_tensors = []
for output in self._output_bindings:
output_shape = list(output["shape"])
output_shape[0] = batch_size
output_tensor = np.zeros(output_shape, dtype=self._get_numpy_dtype(output["dtype"]))
output_tensors.append(output_tensor)
bindings = [int(input_tensor.ctypes.data)] + [int(t.ctypes.data) for t in output_tensors]
self._context.execute_v2(bindings=bindings)
inference_time_ms = (time.perf_counter() - start_time) * 1000
self._update_performance_stats(inference_time_ms, batch_size)
return output_tensors, inference_time_ms
def infer_async(self, input_data: np.ndarray) -> Tuple[List[np.ndarray], float]:
"""
@@ -333,6 +378,14 @@ class TensorRTEngine:
def _release_resources(self):
"""释放资源Python TensorRT 由 GC 管理,无需 destroy"""
if self._cuda_context:
try:
self._cuda_context.pop()
self._cuda_context.detach()
except Exception:
pass
self._cuda_context = None
if self._stream:
try:
self._stream.synchronize()