"""
优化压力测试可视化模块
专门为原生 TensorRT 测试结果生成图表
"""

import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any

# Matplotlib defaults applied at import time, so they affect every figure
# created afterwards: prefer Arial, fall back to DejaVu Sans.
plt.rcParams['font.family'] = ['Arial', 'DejaVu Sans']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
# Base font size used by text elements that do not set their own size.
plt.rcParams['font.size'] = 10


def load_optimized_results(results_dir: str) -> pd.DataFrame:
    """Load the most recently modified optimized-test result file.

    Looks in ``results_dir`` for files matching ``optimized_results_*.json``
    or ``ultralytics_optimized_*.json`` and parses the one with the newest
    modification time.

    Args:
        results_dir: Directory that holds the JSON result files.

    Returns:
        A DataFrame built directly from the parsed JSON content.

    Raises:
        FileNotFoundError: If no file matches either naming pattern.
    """
    base_dir = Path(results_dir)

    # Both result-file naming schemes are accepted.
    candidates = []
    for pattern in ("optimized_results_*.json", "ultralytics_optimized_*.json"):
        candidates.extend(base_dir.glob(pattern))

    if not candidates:
        raise FileNotFoundError("未找到优化测试结果文件")

    # "Latest" is decided by filesystem modification time, not by file name.
    newest = max(candidates, key=lambda path: path.stat().st_mtime)

    with newest.open('r', encoding='utf-8') as handle:
        payload = json.load(handle)

    return pd.DataFrame(payload)


def _max_per_resolution(df: pd.DataFrame, resolutions: List[int], column: str) -> List[float]:
    """Return the maximum of ``column`` per resolution, using 0 where no rows match."""
    maxima = []
    for res in resolutions:
        rows = df[df['resolution'] == res]
        maxima.append(rows[column].max() if len(rows) > 0 else 0)
    return maxima


def create_performance_comparison(df: pd.DataFrame, output_dir: str) -> str:
    """Render the 2x2 performance comparison figure and save it as a PNG.

    Panels:
        1. Max FPS for the 320 and 480 resolutions: hard-coded baseline values
           vs. the maximum ``actual_fps`` found in ``df``, annotated with the
           resulting speed-up factor.
        2. GPU utilization for the same resolutions: hard-coded baseline
           values vs. the maximum ``gpu_utilization`` found in ``df``, with a
           70% target line.
        3. Mean ``actual_fps`` per ``batch_size`` (only if that column exists).
        4. Mean ``actual_fps`` per ``num_streams``, falling back to
           ``num_threads`` (only if one of those columns exists).

    Args:
        df: Test results. A ``resolution`` column is required; ``actual_fps``
            and ``gpu_utilization`` are read for rows matching 320 or 480.
        output_dir: Directory the PNG is written into; it is not created here.

    Returns:
        Path of the written ``optimized_performance_comparison.png`` as a string.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    # The figure is closed in ``finally`` so a failure while plotting or
    # saving does not leave it registered in pyplot's global state.
    try:
        fig.suptitle('RTX 3050 Optimized Performance Analysis', fontsize=20, fontweight='bold')

        resolutions = [320, 480]
        tick_labels = [f'{res}x{res}' for res in resolutions]
        x = np.arange(len(resolutions))
        width = 0.35

        # 1. Max FPS: baseline (values recorded from an earlier run) vs. optimized.
        original_fps = [33.8, 33.9]
        optimized_fps = _max_per_resolution(df, resolutions, 'actual_fps')

        ax1.bar(x - width/2, original_fps, width, label='Original (Ultralytics)',
                color='#FF6B6B', alpha=0.8)
        ax1.bar(x + width/2, optimized_fps, width, label='Optimized (Native TensorRT)',
                color='#4ECDC4', alpha=0.8)

        ax1.set_title('Max FPS Comparison', fontweight='bold')
        ax1.set_xlabel('Resolution')
        ax1.set_ylabel('FPS')
        ax1.set_xticks(x)
        ax1.set_xticklabels(tick_labels)
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Annotate each pair with the speed-up factor; skipped when either
        # side is zero (no data for that resolution).
        for i, (orig, opt) in enumerate(zip(original_fps, optimized_fps)):
            if orig > 0 and opt > 0:
                improvement = opt / orig
                ax1.text(i, max(orig, opt) + 5, f'{improvement:.1f}x',
                         ha='center', va='bottom', fontweight='bold', color='green')

        # 2. GPU utilization: baseline (from the earlier run) vs. optimized.
        gpu_utils_orig = [30, 34]
        gpu_utils_opt = _max_per_resolution(df, resolutions, 'gpu_utilization')

        ax2.bar(x - width/2, gpu_utils_orig, width, label='Original',
                color='#FF6B6B', alpha=0.8)
        ax2.bar(x + width/2, gpu_utils_opt, width, label='Optimized',
                color='#4ECDC4', alpha=0.8)

        ax2.set_title('GPU Utilization Comparison', fontweight='bold')
        ax2.set_xlabel('Resolution')
        ax2.set_ylabel('GPU Utilization (%)')
        ax2.set_xticks(x)
        ax2.set_xticklabels(tick_labels)
        ax2.grid(True, alpha=0.3)
        ax2.set_ylim(0, 100)

        # The target line must exist before legend() is called, otherwise its
        # label is not picked up by the legend.
        ax2.axhline(y=70, color='green', linestyle='--', alpha=0.7, label='Target (70%)')
        ax2.legend()

        # 3. Batch size vs. mean FPS.
        if 'batch_size' in df.columns:
            batch_perf = df.groupby('batch_size')['actual_fps'].mean()

            ax3.plot(batch_perf.index, batch_perf.values, marker='o', linewidth=3,
                     markersize=8, color='#95E1D3')
            ax3.set_title('Batch Size vs Performance', fontweight='bold')
            ax3.set_xlabel('Batch Size')
            ax3.set_ylabel('Average FPS')
            ax3.grid(True, alpha=0.3)

        # 4. Parallelism vs. mean FPS; result files use either ``num_streams``
        # or ``num_threads`` for this dimension.
        parallel_field = 'num_streams' if 'num_streams' in df.columns else 'num_threads'
        if parallel_field in df.columns:
            parallel_perf = df.groupby(parallel_field)['actual_fps'].mean()

            ax4.plot(parallel_perf.index, parallel_perf.values, marker='s', linewidth=3,
                     markersize=8, color='#F38BA8')

            if parallel_field == 'num_streams':
                title = 'Number of Streams vs Performance'
                xlabel = 'Number of CUDA Streams'
            else:
                title = 'Number of Threads vs Performance'
                xlabel = 'Number of Threads'

            ax4.set_title(title, fontweight='bold')
            ax4.set_xlabel(xlabel)
            ax4.set_ylabel('Average FPS')
            ax4.grid(True, alpha=0.3)

        fig.tight_layout()
        output_file = Path(output_dir) / "optimized_performance_comparison.png"
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    return str(output_file)


def create_optimization_analysis(df: pd.DataFrame, output_dir: str) -> str:
    """Render the 2x2 optimization analysis figure and save it as a PNG.

    Panels (each is drawn only when the columns it needs are present):
        1. Histogram of ``avg_inference_time_ms`` with a line at the mean.
        2. Scatter of ``actual_fps`` against ``gpu_utilization``, coloured by
           ``batch_size`` when that column exists.
        3. Mean ``memory_used_mb`` per ``batch_size`` (or the raw
           ``memory_used_mb`` values when there is no ``batch_size`` column).
        4. Text summary comparing the best measured FPS / GPU utilization
           against hard-coded baseline values.

    Args:
        df: Test results.
        output_dir: Directory the PNG is written into; it is not created here.

    Returns:
        Path of the written ``optimization_analysis.png`` as a string.
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    # The figure is closed in ``finally`` so a failure while plotting or
    # saving does not leave it registered in pyplot's global state.
    try:
        fig.suptitle('TensorRT Optimization Analysis', fontsize=16, fontweight='bold')

        has_batch = 'batch_size' in df.columns
        has_fps = 'actual_fps' in df.columns
        has_gpu = 'gpu_utilization' in df.columns

        # 1. Latency distribution.
        if 'avg_inference_time_ms' in df.columns:
            latencies = df['avg_inference_time_ms'].dropna()
            mean_latency = latencies.mean()

            ax1.hist(latencies, bins=20, alpha=0.7, color='#95E1D3', edgecolor='black')
            ax1.axvline(mean_latency, color='red', linestyle='--', linewidth=2,
                        label=f'Mean: {mean_latency:.2f}ms')
            ax1.set_title('Inference Latency Distribution', fontweight='bold')
            ax1.set_xlabel('Latency (ms)')
            ax1.set_ylabel('Frequency')
            ax1.legend()
            ax1.grid(True, alpha=0.3)

        # 2. Throughput vs. GPU utilization.
        if has_gpu and has_fps:
            if has_batch:
                points = ax2.scatter(df['gpu_utilization'], df['actual_fps'],
                                     c=df['batch_size'], cmap='viridis', s=100, alpha=0.7)
            else:
                # A colormap only applies to numeric ``c`` data; passing one
                # together with a fixed colour makes matplotlib warn.
                points = ax2.scatter(df['gpu_utilization'], df['actual_fps'],
                                     c='blue', s=100, alpha=0.7)
            ax2.set_title('Throughput vs GPU Utilization', fontweight='bold')
            ax2.set_xlabel('GPU Utilization (%)')
            ax2.set_ylabel('Throughput (FPS)')
            ax2.grid(True, alpha=0.3)

            if has_batch:
                cbar = plt.colorbar(points, ax=ax2)
                cbar.set_label('Batch Size')

        # 3. Memory usage.
        if 'memory_used_mb' in df.columns:
            if has_batch:
                memory_by_batch = df.groupby('batch_size')['memory_used_mb'].mean()
            else:
                # NOTE(review): without ``batch_size`` the bars are per row,
                # yet the panel is still labelled "by Batch Size".
                memory_by_batch = df['memory_used_mb']

            if len(memory_by_batch) > 1:
                positions = range(len(memory_by_batch))
                ax3.bar(positions, memory_by_batch.values,
                        color='#FFEAA7', alpha=0.8)
                ax3.set_title('Memory Usage by Batch Size', fontweight='bold')
                ax3.set_xlabel('Batch Size')
                ax3.set_ylabel('Memory Usage (MB)')
                ax3.set_xticks(positions)
                ax3.set_xticklabels(memory_by_batch.index)

                # Reference line at the hard-coded 8192 MB total.
                ax3.axhline(y=8192, color='red', linestyle='--', alpha=0.7, label='Total VRAM (8GB)')
                ax3.legend()
            else:
                ax3.text(0.5, 0.5, 'Insufficient data for memory analysis',
                         ha='center', va='center', transform=ax3.transAxes)

        # 4. Text summary.
        ax4.text(0.05, 0.95, 'Optimization Results Summary',
                 fontsize=16, fontweight='bold', transform=ax4.transAxes)

        if len(df) > 0 and has_fps and has_gpu:
            max_fps = df['actual_fps'].max()
            max_gpu_util = df['gpu_utilization'].max()
            avg_latency = df['avg_inference_time_ms'].mean() if 'avg_inference_time_ms' in df.columns else 0

            # Hard-coded baseline the measured values are compared against.
            original_max_fps = 33.8
            original_gpu_util = 30

            fps_improvement = max_fps / original_max_fps if original_max_fps > 0 else 0
            gpu_improvement = max_gpu_util / original_gpu_util if original_gpu_util > 0 else 0

            # NOTE(review): some of these emoji may have no glyph in the
            # configured Arial / DejaVu Sans fonts -- confirm they render.
            summary_text = [
                f'🚀 Max FPS: {max_fps:.1f} (vs {original_max_fps:.1f})',
                f'📈 FPS Improvement: {fps_improvement:.1f}x',
                f'🔥 Max GPU Util: {max_gpu_util:.1f}% (vs {original_gpu_util}%)',
                f'📊 GPU Improvement: {gpu_improvement:.1f}x',
                f'⚡ Avg Latency: {avg_latency:.2f}ms',
                '',
                '✅ Optimization Success!' if fps_improvement > 2 else '⚠️ Needs More Optimization',
                'Target: 70%+ GPU utilization',
                f'Achieved: {max_gpu_util:.1f}% GPU utilization',
            ]

            for i, text in enumerate(summary_text):
                ax4.text(0.05, 0.85 - i*0.08, text, fontsize=12,
                         transform=ax4.transAxes)
        elif len(df) > 0:
            # The metric columns are optional for panel 2, so their absence
            # must not abort the whole figure here either.
            ax4.text(0.05, 0.85, 'Insufficient data for summary', fontsize=12,
                     transform=ax4.transAxes)

        ax4.set_xlim(0, 1)
        ax4.set_ylim(0, 1)
        ax4.axis('off')

        fig.tight_layout()
        output_file = Path(output_dir) / "optimization_analysis.png"
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    return str(output_file)


def create_deployment_recommendations(df: pd.DataFrame, output_dir: str) -> str:
    """Render the deployment recommendation figure and save it as a PNG.

    Left panel: heatmap of mean ``actual_fps`` for every combination of
    ``batch_size`` and the parallelism column (``num_streams``, falling back
    to ``num_threads``). It is drawn only when both columns exist and the
    pivot is non-empty.

    Right panel: text describing the row with the highest ``actual_fps``
    followed by three fixed deployment scenarios whose expected FPS is
    derived from that best value (100%, 70% and 50% of it).

    Args:
        df: Test results. When non-empty it must provide ``actual_fps``,
            ``resolution`` and ``gpu_utilization``.
        output_dir: Directory the PNG is written into; it is not created here.

    Returns:
        Path of the written ``deployment_recommendations.png`` as a string.

    Raises:
        KeyError: If ``df`` is non-empty and a required column is missing.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    # The figure is closed in ``finally`` so a failure while plotting or
    # saving does not leave it registered in pyplot's global state.
    try:
        fig.suptitle('Optimized Deployment Recommendations', fontsize=16, fontweight='bold')

        # Result files name the parallelism dimension either ``num_streams``
        # or ``num_threads``; accept both, as the performance chart does.
        if 'num_streams' in df.columns:
            parallel_field, parallel_name = 'num_streams', 'Streams'
        elif 'num_threads' in df.columns:
            parallel_field, parallel_name = 'num_threads', 'Threads'
        else:
            parallel_field, parallel_name = None, 'Streams'

        # 1. Heatmap of mean FPS per (batch size, parallelism) configuration.
        if 'batch_size' in df.columns and parallel_field is not None:
            pivot_data = df.pivot_table(
                values='actual_fps',
                index='batch_size',
                columns=parallel_field,
                aggfunc='mean',
            )

            if not pivot_data.empty:
                im1 = ax1.imshow(pivot_data.values, cmap='RdYlGn', aspect='auto')
                ax1.set_title('Performance Heatmap (FPS)', fontweight='bold')
                ax1.set_xlabel(f'Number of {parallel_name}')
                ax1.set_ylabel('Batch Size')
                ax1.set_xticks(range(len(pivot_data.columns)))
                ax1.set_xticklabels(pivot_data.columns)
                ax1.set_yticks(range(len(pivot_data.index)))
                ax1.set_yticklabels(pivot_data.index)

                # Label every measured cell; NaN marks combinations with no rows.
                for (row, col), fps in np.ndenumerate(pivot_data.values):
                    if not np.isnan(fps):
                        ax1.text(col, row, f'{fps:.1f}',
                                 ha="center", va="center", color="black", fontweight='bold')

                plt.colorbar(im1, ax=ax1, label='FPS')

        # 2. Recommendation text.
        ax2.text(0.05, 0.95, 'Recommended Configurations',
                 fontsize=16, fontweight='bold', transform=ax2.transAxes)

        if len(df) > 0:
            # Row with the highest measured throughput.
            best_config = df.loc[df['actual_fps'].idxmax()]
            best_fps = best_config["actual_fps"]
            if parallel_field is not None:
                parallel_value = best_config.get(parallel_field, "N/A")
            else:
                parallel_value = "N/A"

            recommendations = [
                '🏆 Best Performance Configuration:',
                f'  • Resolution: {best_config["resolution"]}x{best_config["resolution"]}',
                f'  • Batch Size: {best_config.get("batch_size", "N/A")}',
                f'  • {parallel_name}: {parallel_value}',
                f'  • Performance: {best_fps:.1f} FPS',
                f'  • GPU Util: {best_config["gpu_utilization"]:.1f}%',
                '',
                '💡 Deployment Scenarios:',
                '',
                '🎯 High Throughput (Max FPS):',
                '  • Use batch size 16-32',
                '  • Use 4-8 CUDA streams',
                f'  • Expected: {best_fps:.0f}+ FPS',
                '',
                '⚖️ Balanced (GPU ~70%):',
                '  • Use batch size 8-16',
                '  • Use 2-4 CUDA streams',
                f'  • Expected: {best_fps*0.7:.0f} FPS',
                '',
                '🔋 Power Efficient (GPU ~50%):',
                '  • Use batch size 4-8',
                '  • Use 2 CUDA streams',
                f'  • Expected: {best_fps*0.5:.0f} FPS',
            ]

            for i, rec in enumerate(recommendations):
                ax2.text(0.05, 0.85 - i*0.04, rec, fontsize=10,
                         transform=ax2.transAxes)

        ax2.set_xlim(0, 1)
        ax2.set_ylim(0, 1)
        ax2.axis('off')

        fig.tight_layout()
        output_file = Path(output_dir) / "deployment_recommendations.png"
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    return str(output_file)


def generate_optimized_charts(results_dir: str) -> List[str]:
    """Generate every chart for the optimized test results.

    Loads the newest result file from ``results_dir`` and writes the
    performance comparison, optimization analysis and deployment
    recommendation charts back into the same directory.

    Each chart is built independently: a failure in one is reported on stdout
    and does not prevent the others from being generated or hide the paths of
    charts that were written successfully.

    Args:
        results_dir: Directory holding the result JSON; also the output directory.

    Returns:
        Paths of the charts that were written. Empty when the results could
        not be loaded or no chart succeeded.
    """
    try:
        df = load_optimized_results(results_dir)
    except Exception as e:
        # Best-effort entry point: report and signal "nothing generated".
        print(f"生成图表失败: {e}")
        return []

    builders = (
        create_performance_comparison,      # 1. performance comparison
        create_optimization_analysis,       # 2. optimization analysis
        create_deployment_recommendations,  # 3. deployment recommendations
    )

    chart_files = []
    for build_chart in builders:
        try:
            chart_files.append(build_chart(df, results_dir))
        except Exception as e:
            print(f"生成图表失败: {e}")

    return chart_files


if __name__ == "__main__":
    import sys

    # The first command-line argument overrides the default results directory.
    results_dir = sys.argv[1] if len(sys.argv) > 1 else "./optimized_stress_results"

    chart_files = generate_optimized_charts(results_dir)

    # Report the outcome: either the list of written charts or a failure notice.
    if not chart_files:
        print("❌ 未生成任何图表")
    else:
        print(f"✅ 生成了 {len(chart_files)} 个图表:")
        for chart_path in chart_files:
            print(f"  📊 {chart_path}")