"""
Batch 组装和 GPU 预处理模块
"""

import time
import queue
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass

import cv2
import numpy as np

from .utils import setup_logging

# Module-level logger, obtained from the package's setup_logging() helper.
logger = setup_logging()


class GPUPreprocessor:
    """Image preprocessor that uses OpenCV CUDA when available, else the CPU.

    Every frame is resized to ``target_size``, converted from BGR to RGB,
    cast to ``float32`` and divided by 255.0, then transposed from
    height-width-channel to channel-height-width order.
    """

    # Both code paths end with a BGR -> RGB conversion, so the result always
    # carries three channels.
    _OUT_CHANNELS = 3

    def __init__(self, target_size: Tuple[int, int], device_id: int = 0, use_gpu: bool = True):
        """Store the configuration and probe for CUDA support.

        Args:
            target_size: Output size as ``(height, width)``.
            device_id: CUDA device index. It is stored on the instance; the
                code in this class does not select a device with it.
            use_gpu: When ``False`` the CUDA probe is skipped and every frame
                is processed on the CPU.
        """
        self.target_size = target_size
        self.device_id = device_id
        self.use_gpu = use_gpu
        self._init_gpu()

    def _init_gpu(self):
        """Decide whether the CUDA path is usable and set ``_gpu_available``."""
        self._gpu_available = False

        if not self.use_gpu:
            logger.info("GPU 预处理已禁用，使用 CPU")
            return

        try:
            # Query once and reuse the value for both the check and the log.
            device_count = cv2.cuda.getCudaEnabledDeviceCount()
        except Exception as e:
            # Best effort: any failure while probing means "no GPU".
            logger.warning(f"GPU 初始化失败: {e}，使用 CPU 预处理")
            return

        if device_count > 0:
            self._gpu_available = True
            logger.info(f"OpenCV CUDA 可用，设备数: {device_count}")
        else:
            logger.warning("OpenCV CUDA 不可用，使用 CPU 预处理")

    def preprocess_single(self, frame: np.ndarray) -> np.ndarray:
        """Preprocess one frame on the GPU if available, otherwise on the CPU.

        Args:
            frame: Image array; it is passed through ``cv2.COLOR_BGR2RGB``,
                so BGR channel order is expected.

        Returns:
            A ``float32`` array in channel-height-width order.
        """
        h, w = self.target_size

        if self._gpu_available:
            return self._preprocess_gpu(frame, h, w)
        return self._preprocess_cpu(frame, h, w)

    @staticmethod
    def _to_chw_float32(rgb: np.ndarray) -> np.ndarray:
        """Divide an HWC image by 255.0 as float32 and reorder it to CHW."""
        return np.transpose(rgb.astype(np.float32) / 255.0, (2, 0, 1))

    def _preprocess_cpu(self, frame: np.ndarray, h: int, w: int) -> np.ndarray:
        """Resize to ``(h, w)`` and convert BGR -> RGB on the CPU."""
        # cv2.resize takes its size argument as (width, height).
        resized = cv2.resize(frame, (w, h), interpolation=cv2.INTER_LINEAR)
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        return self._to_chw_float32(rgb)

    def _preprocess_gpu(self, frame: np.ndarray, h: int, w: int) -> np.ndarray:
        """Resize and convert colour on the GPU; fall back to the CPU on error.

        Only the resize and colour conversion run on the device. The image is
        downloaded before the float conversion and transpose, which are done
        with NumPy.
        """
        try:
            gpu_frame = cv2.cuda_GpuMat()
            gpu_frame.upload(frame)
            gpu_resized = cv2.cuda.resize(gpu_frame, (w, h), interpolation=cv2.INTER_LINEAR)
            gpu_rgb = cv2.cuda.cvtColor(gpu_resized, cv2.COLOR_BGR2RGB)
            return self._to_chw_float32(gpu_rgb.download())
        except Exception as e:
            # Deliberate best effort: a GPU failure must not lose the frame.
            logger.warning(f"GPU 预处理失败: {e}，回退到 CPU")
            return self._preprocess_cpu(frame, h, w)

    def preprocess_batch(self, frames: List[np.ndarray], stream: Any = None) -> np.ndarray:
        """Preprocess several frames and stack them along a new first axis.

        Args:
            frames: Frames to preprocess, each handled by
                ``preprocess_single``.
            stream: Accepted but not used by this implementation.

        Returns:
            A ``float32`` array of shape ``(len(frames), 3, height, width)``.
            An empty ``frames`` list yields an empty array with the same
            dtype and trailing dimensions, so callers can treat both cases
            uniformly.
        """
        if not frames:
            h, w = self.target_size
            return np.empty((0, self._OUT_CHANNELS, h, w), dtype=np.float32)
        return np.stack([self.preprocess_single(frame) for frame in frames], axis=0)


@dataclass
class FrameInfo:
    """Metadata kept alongside one frame taken from a source queue."""
    # The frame as it was dequeued, i.e. before any preprocessing.
    frame: np.ndarray
    # Timestamp delivered with the frame; its clock and units are not
    # visible in this module — TODO confirm with the producer.
    timestamp: float
    # Identifier of the source, as carried inside the queue item.
    source_id: str


class BatchAssembler:
    """Collect frames from several per-source queues into preprocessed batches.

    Queues are polled in round-robin order, and the position is remembered
    between calls so that no source is always served first. Each queue item
    must be a ``(frame, timestamp, source_id)`` tuple.
    """

    def __init__(
        self,
        frame_queues: Dict[str, queue.Queue],
        batch_size: int,
        imgsz: Tuple[int, int],
        use_gpu_preprocess: bool = True,
        device_id: int = 0
    ):
        """Set up the preprocessor, the counters and the round-robin state.

        Args:
            frame_queues: Mapping from source id to the queue of that source.
                The keys are snapshotted here; later changes to the mapping
                are not picked up.
            batch_size: Maximum number of frames per batch.
            imgsz: Target image size handed to the preprocessor.
            use_gpu_preprocess: Forwarded to the preprocessor as ``use_gpu``.
            device_id: Forwarded to the preprocessor.
        """
        self.frame_queues = frame_queues
        self.batch_size = batch_size
        self.imgsz = imgsz

        self.preprocessor = GPUPreprocessor(
            target_size=imgsz,
            device_id=device_id,
            use_gpu=use_gpu_preprocess
        )

        self._total_frames = 0
        # NOTE(review): nothing in this class increments this counter, so
        # the reported drop rate stays 0 unless it is updated elsewhere.
        self._dropped_frames = 0
        self._incomplete_batches = 0

        self._queue_keys = list(frame_queues.keys())
        self._current_index = 0

    def assemble_batch(self, timeout: float = 0.1) -> Optional[Tuple[np.ndarray, List["FrameInfo"]]]:
        """Gather up to ``batch_size`` frames and preprocess them as one batch.

        Args:
            timeout: Maximum time in seconds to keep polling for frames.

        Returns:
            ``None`` when no frame arrived before the timeout. Otherwise a
            tuple of the preprocessed batch and one ``FrameInfo`` per frame,
            in the same order. The batch may hold fewer than ``batch_size``
            frames if the timeout expired first.
        """
        frames = []
        frame_infos = []

        # A monotonic clock keeps the timeout correct even if the wall clock
        # is adjusted while we are waiting.
        start = time.monotonic()

        while len(frames) < self.batch_size:
            if time.monotonic() - start > timeout:
                break

            if not self._poll_queues_once(frames, frame_infos):
                # Every queue was empty: yield briefly instead of spinning.
                time.sleep(0.001)

        if not frames:
            return None

        self._total_frames += len(frames)

        if len(frames) < self.batch_size:
            self._incomplete_batches += 1

        batch = self.preprocessor.preprocess_batch(frames)

        return batch, frame_infos

    def _poll_queues_once(self, frames: List[np.ndarray], frame_infos: List["FrameInfo"]) -> bool:
        """Visit each queue once in round-robin order, taking at most one item.

        Dequeued frames are appended to ``frames`` and ``frame_infos``. The
        pass stops early once ``batch_size`` frames have been collected.

        Returns:
            ``True`` if at least one frame was taken during this pass.
        """
        got_frame = False
        key_count = len(self._queue_keys)

        for _ in range(key_count):
            source_id = self._queue_keys[self._current_index]
            # Advance before reading so the next pass starts at the next queue.
            self._current_index = (self._current_index + 1) % key_count

            try:
                frame, timestamp, src_id = self.frame_queues[source_id].get_nowait()
            except queue.Empty:
                continue

            frames.append(frame)
            # The source id stored is the one carried by the item itself.
            frame_infos.append(FrameInfo(frame=frame, timestamp=timestamp, source_id=src_id))
            got_frame = True

            if len(frames) >= self.batch_size:
                break

        return got_frame

    def get_drop_rate(self) -> float:
        """Return dropped frames as a percentage of total assembled frames.

        Returns 0.0 when no frame has been assembled yet.
        """
        if self._total_frames == 0:
            return 0.0
        return self._dropped_frames / self._total_frames * 100

    def get_stats(self) -> Dict[str, Any]:
        """Return the current counters and the derived drop rate."""
        return {
            "total_frames": self._total_frames,
            "dropped_frames": self._dropped_frames,
            "incomplete_batches": self._incomplete_batches,
            "drop_rate": self.get_drop_rate(),
        }

    def reset_stats(self):
        """Reset all counters to zero."""
        self._total_frames = 0
        self._dropped_frames = 0
        self._incomplete_batches = 0