生成新engine

2026-01-21 13:29:39 +08:00
parent e965b10603
commit 2c00b5afe3
6 changed files with 547 additions and 181 deletions
--- a/inference/engine.py
+++ b/inference/engine.py
@@ -1,4 +1,7 @@
 import os
+
+os.environ["TENSORRT_DISABLE_MYELIN"] = "1"
+
 import time
 from typing import Any, Dict, List, Optional, Tuple

@@ -6,12 +9,146 @@ import cv2
 import numpy as np
 import tensorrt as trt
 import torch
+import onnxruntime as ort
 from ultralytics import YOLO
-from ultralytics.engine.results import Results
+from ultralytics.engine.results import Results, Boxes as UltralyticsBoxes

 from config import get_config


+class ONNXEngine:
+    """Detection backend that runs an exported ONNX model via ONNX Runtime.
+
+    Each frame is resized to ``imgsz`` (height, width), pushed through the
+    session, filtered by confidence, de-duplicated with NMS and returned as
+    an Ultralytics ``Results`` whose boxes are ``[x1, y1, x2, y2, conf, cls]``
+    in original-image pixels.
+
+    NOTE(review): ``postprocess`` reads column 4 as the score and columns 5+
+    as per-class values, and by default treats columns 0-3 as corner boxes,
+    exactly as the code it replaces did. Raw Ultralytics detection exports
+    usually emit centre-format boxes -- confirm the layout of the exported
+    model and pass ``box_format="cxcywh"`` if that is the case.
+    """
+
+    _BOX_FORMATS = ("xyxy", "cxcywh")
+
+    def __init__(
+        self,
+        onnx_path: Optional[str] = None,
+        device: int = 0,
+        box_format: str = "xyxy",
+    ):
+        """Read model settings from the project config and open the session.
+
+        Args:
+            onnx_path: Model file; defaults to ``config.model.onnx_path``.
+            device: A negative value selects the CPU provider only; otherwise
+                the CUDA provider is listed first with CPU as fallback.
+            box_format: Layout of the four box columns in the model output,
+                ``"xyxy"`` (corners) or ``"cxcywh"`` (centre + size).
+
+        Raises:
+            ValueError: If ``box_format`` is not a supported layout.
+            FileNotFoundError: If the model file does not exist.
+        """
+        # Assigned first so __del__ is safe even if construction fails below.
+        self.session = None
+        self.input_names = None
+        self.output_names = None
+
+        if box_format not in self._BOX_FORMATS:
+            raise ValueError(
+                f"box_format must be one of {self._BOX_FORMATS}, got {box_format!r}"
+            )
+        self.box_format = box_format
+
+        config = get_config()
+        self.onnx_path = onnx_path or config.model.onnx_path
+        self.device = device
+        imgsz = config.model.imgsz
+        # Stored as (height, width); a bare int means a square input.
+        self.imgsz = (imgsz, imgsz) if isinstance(imgsz, int) else tuple(imgsz)
+        self.conf_thresh = config.model.conf_threshold
+        self.iou_thresh = config.model.iou_threshold
+
+        self.load_model()
+
+    def load_model(self):
+        """Create the inference session and cache its input/output names.
+
+        Raises:
+            FileNotFoundError: If ``self.onnx_path`` does not exist.
+        """
+        if not os.path.exists(self.onnx_path):
+            raise FileNotFoundError(f"ONNX模型文件不存在: {self.onnx_path}")
+
+        if self.device >= 0:
+            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        else:
+            providers = ['CPUExecutionProvider']
+        self.session = ort.InferenceSession(self.onnx_path, providers=providers)
+
+        self.input_names = [inp.name for inp in self.session.get_inputs()]
+        self.output_names = [out.name for out in self.session.get_outputs()]
+
+    def preprocess(self, frame: np.ndarray) -> np.ndarray:
+        """Convert a BGR frame into a normalised CHW float32 array.
+
+        The frame is stretched to the network size without letterboxing, so
+        ``postprocess`` can map boxes back with independent x/y scale factors.
+        """
+        in_h, in_w = self.imgsz
+        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        # cv2.resize takes dsize as (width, height), the reverse of imgsz.
+        img = cv2.resize(img, (in_w, in_h))
+        return img.transpose(2, 0, 1).astype(np.float32) / 255.0
+
+    def _make_result(self, orig_img: np.ndarray, detections: Optional[torch.Tensor]) -> Results:
+        """Wrap an ``(N, 6)`` detection tensor (or nothing) in a ``Results``.
+
+        ``Results`` builds its own ``Boxes`` from the raw tensor, so the
+        tensor is passed as-is. No detections are represented by an empty
+        ``(0, 6)`` tensor so callers can iterate ``result.boxes`` the same
+        way they do for the PyTorch backend.
+        """
+        if detections is None:
+            detections = torch.zeros((0, 6), dtype=torch.float32)
+        return Results(
+            orig_img=orig_img,
+            path="",
+            names={0: "person"},
+            boxes=detections,
+        )
+
+    def postprocess(self, output: np.ndarray, orig_img: np.ndarray) -> List[Results]:
+        """Turn one image's raw model output into a single-element result list.
+
+        Args:
+            output: 2-D array for one image; it is transposed so that each
+                row is one candidate detection.
+            orig_img: The frame the prediction belongs to; its size is used
+                to rescale boxes from network to image coordinates.
+
+        Raises:
+            ValueError: If ``output`` is not 2-D.
+        """
+        if output.ndim != 2:
+            raise ValueError(f"expected a 2-D model output, got shape {output.shape}")
+        preds = output.T
+
+        boxes = preds[:, :4]
+        scores = preds[:, 4]
+        if preds.shape[1] > 5:
+            classes = preds[:, 5:].argmax(axis=1)
+        else:
+            classes = np.zeros(len(preds), dtype=np.int32)
+
+        keep = scores > self.conf_thresh
+        boxes, scores, classes = boxes[keep], scores[keep], classes[keep]
+        if len(boxes) == 0:
+            return [self._make_result(orig_img, None)]
+
+        if self.box_format == "cxcywh":
+            half_w = boxes[:, 2] / 2.0
+            half_h = boxes[:, 3] / 2.0
+            xyxy = np.stack(
+                [
+                    boxes[:, 0] - half_w,
+                    boxes[:, 1] - half_h,
+                    boxes[:, 0] + half_w,
+                    boxes[:, 1] + half_h,
+                ],
+                axis=1,
+            )
+        else:
+            xyxy = boxes
+
+        # cv2.dnn.NMSBoxes expects top-left x, y plus width, height.
+        xywh = xyxy.copy()
+        xywh[:, 2:] -= xywh[:, :2]
+        picked = cv2.dnn.NMSBoxes(
+            xywh.tolist(),
+            scores.tolist(),
+            self.conf_thresh,
+            self.iou_thresh,
+        )
+        # Depending on the OpenCV build this is a flat array, an Nx1 array or
+        # an empty tuple; normalise to a flat index vector.
+        picked = np.asarray(picked, dtype=np.int64).reshape(-1)
+        if picked.size == 0:
+            return [self._make_result(orig_img, None)]
+
+        orig_h, orig_w = orig_img.shape[:2]
+        in_h, in_w = self.imgsz
+        scale = np.array(
+            [orig_w / in_w, orig_h / in_h, orig_w / in_w, orig_h / in_h],
+            dtype=np.float32,
+        )
+        detections = np.concatenate(
+            [
+                xyxy[picked] * scale,
+                scores[picked, None],
+                classes[picked, None],
+            ],
+            axis=1,
+        ).astype(np.float32)
+
+        return [self._make_result(orig_img, torch.from_numpy(detections))]
+
+    def inference(self, images: List[np.ndarray]) -> List[Results]:
+        """Run the model on a batch of BGR frames, one ``Results`` per frame."""
+        if not images:
+            return []
+
+        batch = np.stack([self.preprocess(frame) for frame in images], axis=0)
+        outputs = self.session.run(self.output_names, {self.input_names[0]: batch})
+
+        results = []
+        # Only the first output is used; its leading axis is the batch axis.
+        for pred, frame in zip(outputs[0], images):
+            results.extend(self.postprocess(pred, frame))
+        return results
+
+    def inference_single(self, frame: np.ndarray) -> List[Results]:
+        """Convenience wrapper: run ``inference`` on one frame."""
+        return self.inference([frame])
+
+    def warmup(self, num_warmup: int = 10):
+        """Run ``num_warmup`` inferences on a black frame of the network size."""
+        in_h, in_w = self.imgsz
+        dummy_frame = np.zeros((in_h, in_w, 3), dtype=np.uint8)
+        for _ in range(num_warmup):
+            self.inference_single(dummy_frame)
+
+    def __del__(self):
+        """Release the session reference.
+
+        The session is created without profiling enabled, so there is no
+        profile to flush; dropping the reference is all the cleanup needed.
+        """
+        self.session = None
+
+
 class TensorRTEngine:
    def __init__(self, engine_path: Optional[str] = None, device: int = 0):
        config = get_config()
@@ -25,9 +162,11 @@ class TensorRTEngine:
        self.logger = trt.Logger(trt.Logger.INFO)
        self.engine = None
        self.context = None
-        self.stream = None
+        self.stream = torch.cuda.Stream(device=self.device)
        self.input_buffer = None
        self.output_buffers = []
+        self.input_name = None
+        self.output_name = None

        self._load_engine()

@@ -44,29 +183,39 @@ class TensorRTEngine:
        self.context = self.engine.create_execution_context()

        self.stream = torch.cuda.Stream(device=self.device)
+        self.batch_size = 1

        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            dtype = self.engine.get_tensor_dtype(name)
-            shape = self.engine.get_tensor_shape(name)
+            shape = list(self.engine.get_tensor_shape(name))

            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
-                self.context.set_tensor_address(name, None)
+                if -1 in shape:
+                    shape = [self.batch_size if d == -1 else d for d in shape]
+                if dtype == trt.float16:
+                    buffer = torch.zeros(shape, dtype=torch.float16, device=self.device)
+                else:
+                    buffer = torch.zeros(shape, dtype=torch.float32, device=self.device)
+                self.input_buffer = buffer
+                self.input_name = name
            else:
+                if -1 in shape:
+                    shape = [self.batch_size if d == -1 else d for d in shape]
                if dtype == trt.float16:
                    buffer = torch.zeros(shape, dtype=torch.float16, device=self.device)
                else:
                    buffer = torch.zeros(shape, dtype=torch.float32, device=self.device)
                self.output_buffers.append(buffer)
-                self.context.set_tensor_address(name, buffer.data_ptr())
+                if self.output_name is None:
+                    self.output_name = name

-        self.context.set_optimization_profile_async(0, self.stream)
+            self.context.set_tensor_address(name, buffer.data_ptr())

-        self.input_buffer = torch.zeros(
-            (1, 3, self.imgsz[0], self.imgsz[1]),
-            dtype=torch.float16 if self.half else torch.float32,
-            device=self.device,
-        )
+        stream_handle = torch.cuda.current_stream(self.device).cuda_stream
+        self.context.set_optimization_profile_async(0, stream_handle)
+
+        self.batch_size = 1

    def preprocess(self, frame: np.ndarray) -> torch.Tensor:
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
@@ -95,16 +244,20 @@ class TensorRTEngine:
                )

        self.context.set_tensor_address(
-            "input", input_tensor.contiguous().data_ptr()
+            self.input_name, input_tensor.contiguous().data_ptr()
        )

+        input_shape = list(input_tensor.shape)
+        self.context.set_input_shape(self.input_name, input_shape)
+
        torch.cuda.synchronize(self.stream)
-        self.context.execute_async_v3(self.stream.handle)
+        self.context.execute_async_v3(self.stream.cuda_stream)
        torch.cuda.synchronize(self.stream)

        results = []
        for i in range(batch_size):
            pred = self.output_buffers[0][i].cpu().numpy()
+            pred = pred.T  # 转置: (8400, 84)
            boxes = pred[:, :4]
            scores = pred[:, 4]
            classes = pred[:, 5].astype(np.int32)
@@ -142,7 +295,7 @@ class TensorRTEngine:
                        orig_img=images[i],
                        path="",
                        names={0: "person"},
-                        boxes=Boxes(
+                        boxes=UltralyticsBoxes(
                            torch.tensor([box_orig + [conf, cls]]),
                            orig_shape=(orig_h, orig_w),
                        ),
@@ -161,9 +314,15 @@ class TensorRTEngine:

    def __del__(self):
+        """Best-effort teardown: synchronize the context and stream if set.
+
+        Exceptions raised by either ``synchronize()`` call are swallowed so
+        that object finalization never propagates an error.
+        """
        if self.context:
-            self.context.synchronize()
+            try:
+                self.context.synchronize()
+            except Exception:
+                # Deliberately ignored: nothing useful can be done in __del__.
+                pass
        if self.stream:
-            self.stream.synchronize()
+            try:
+                self.stream.synchronize()
+            except Exception:
+                # Deliberately ignored: nothing useful can be done in __del__.
+                pass


 class Boxes:
@@ -196,6 +355,15 @@ class Boxes:
        return self.data[:, 5]


+def _check_pt_file_valid(pt_path: str) -> bool:
+    """Report whether ``pt_path`` can be opened and holds at least 10 bytes.
+
+    This is only a readability/size probe; the file contents are not parsed.
+    Any error while opening or reading the file yields ``False``.
+    """
+    required = 10
+    try:
+        with open(pt_path, 'rb') as handle:
+            prefix = handle.read(required)
+    except Exception:
+        return False
+    return len(prefix) == required
+
+
 class YOLOEngine:
    def __init__(
        self,
@@ -203,38 +371,61 @@ class YOLOEngine:
        device: int = 0,
        use_trt: bool = True,
    ):
-        self.use_trt = use_trt
-        self.device = device
+        self.use_trt = False
+        self.onnx_engine = None
        self.trt_engine = None
+        self.device = device
+        config = get_config()

-        if not use_trt:
-            if model_path:
-                pt_path = model_path
-            elif hasattr(get_config().model, 'pt_model_path'):
-                pt_path = get_config().model.pt_model_path
-            else:
-                pt_path = get_config().model.engine_path.replace(".engine", ".pt")
-            self.model = YOLO(pt_path)
-            self.model.to(device)
-        else:
+        if use_trt:
            try:
                self.trt_engine = TensorRTEngine(device=device)
                self.trt_engine.warmup()
+                self.use_trt = True
+                print("TensorRT引擎加载成功")
+                return
            except Exception as e:
-                print(f"TensorRT加载失败，回退到PyTorch: {e}")
-                self.use_trt = False
-                if model_path:
-                    pt_path = model_path
-                elif hasattr(get_config().model, 'pt_model_path'):
-                    pt_path = get_config().model.pt_model_path
-                else:
-                    pt_path = get_config().model.engine_path.replace(".engine", ".pt")
+                print(f"TensorRT加载失败: {e}")
+
+        try:
+            onnx_path = config.model.onnx_path
+            if os.path.exists(onnx_path):
+                self.onnx_engine = ONNXEngine(device=device)
+                self.onnx_engine.warmup()
+                print("ONNX引擎加载成功")
+                return
+            else:
+                print(f"ONNX模型不存在: {onnx_path}")
+        except Exception as e:
+            print(f"ONNX加载失败: {e}")
+
+        try:
+            pt_path = model_path or config.model.pt_model_path
+            if os.path.exists(pt_path) and _check_pt_file_valid(pt_path):
                self.model = YOLO(pt_path)
                self.model.to(device)
+                print(f"PyTorch模型加载成功: {pt_path}")
+            else:
+                raise FileNotFoundError(f"PT文件无效或不存在: {pt_path}")
+        except Exception as e:
+            print(f"PyTorch加载失败: {e}")
+            raise RuntimeError("所有模型加载方式均失败")

    def __call__(self, frame: np.ndarray, **kwargs) -> List[Results]:
+        """Run detection on ``frame`` with the best backend still available.
+
+        Order of preference is TensorRT, then ONNX, then the PyTorch model.
+        A TensorRT failure at inference time disables TensorRT for all later
+        calls and falls through to the next backend for this one.
+
+        Args:
+            frame: Image to run detection on.
+            **kwargs: Forwarded to the PyTorch model only; the TensorRT and
+                ONNX backends ignore them.
+
+        Returns:
+            The backend's result list, or ``[]`` when no backend is usable.
+        """
-        if self.use_trt:
-            return self.trt_engine.inference_single(frame)
+        if self.use_trt and self.trt_engine:
+            try:
+                return self.trt_engine.inference_single(frame)
+            except Exception as e:
+                print(f"TensorRT推理失败，切换到ONNX: {e}")
+                self.use_trt = False
+
+        if self.onnx_engine:
+            return self.onnx_engine.inference_single(frame)
+
-        else:
-            results = self.model(frame, imgsz=get_config().model.imgsz, **kwargs)
-            return results
+        # ``model`` is only assigned when the PyTorch fallback was loaded, so
+        # it may be missing entirely when TensorRT or ONNX was chosen.
+        model = getattr(self, "model", None)
+        if model is None:
+            return []
+        return model(frame, imgsz=get_config().model.imgsz, **kwargs)
@@ -242,3 +433,5 @@ class YOLOEngine:
    def __del__(self):
+        """Drop this wrapper's references to the TensorRT and ONNX engines."""
+        # ``del`` only removes the instance attribute; each engine's own
+        # __del__ runs once no other reference to it remains.
        if self.trt_engine:
            del self.trt_engine
+        if self.onnx_engine:
+            del self.onnx_engine