# -*- coding: utf-8 -*-
"""
Wheel-Based Accuracy Benchmark - Similarity-aware emotion evaluation

Features:
- Uses Emotion Wheel for similarity scoring
- Exact match = 1.0, Same category = 0.8, Adjacent = 0.5, Opposite = 0.0
- Shows detected emotion for each test
- Calculates weighted accuracy based on similarity
- Identifies contradictory detections vs similar mistakes

Version: 3.0.0
"""

import sys
import os
import time
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass, field
from collections import defaultdict
from datetime import datetime

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from avatar.emotion_wheel import EmotionWheel, get_emotion_wheel, EmotionCategory


@dataclass
class WheelTestResult:
    """Result for a single test with wheel-based scoring"""
    emotion: str
    text: str
    expected_category: str
    detected_emotion: str
    detected_category: Optional[str]
    similarity_score: float
    relationship: str  # "exact", "same_category", "adjacent", "distant", "opposite"
    inference_time_ms: float
    emoji: str


@dataclass
class WheelBenchmarkResults:
    """Aggregated wheel-based benchmark results"""
    total_tests: int = 0

    # Weighted accuracy (using similarity scores)
    weighted_accuracy: float = 0.0

    # Traditional accuracy
    exact_matches: int = 0
    exact_accuracy: float = 0.0

    # Similarity-based metrics
    same_category_matches: int = 0
    adjacent_matches: int = 0
    distant_matches: int = 0
    opposite_matches: int = 0  # These are the real failures

    # Acceptable = exact + same_category + adjacent
    acceptable_accuracy: float = 0.0

    # Contradiction rate (opposite emotions)
    contradiction_rate: float = 0.0

    # Per-emotion results
    emotion_results: Dict[str, List[WheelTestResult]] = field(default_factory=dict)
    emotion_scores: Dict[str, float] = field(default_factory=dict)

    # Category performance
    category_accuracy: Dict[str, float] = field(default_factory=dict)

    # Timing
    avg_inference_time_ms: float = 0.0

    # Failed emotions (>50% opposite)
    failed_emotions: List[str] = field(default_factory=list)

    # Confusion matrix by category
    category_confusion: Dict[str, Dict[str, int]] = field(default_factory=dict)
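

# Reference only: the similarity score this benchmark expects
# EmotionWheel.get_similarity_score() to assign to each relationship type,
# as listed in the module docstring and the report tables below. The values
# used for scoring always come from the wheel itself; this mapping is a
# documentation aid (an assumption about the wheel's configuration), not
# something the code reads.
RELATIONSHIP_SCORES = {
    "exact": 1.0,
    "same_category": 0.8,
    "adjacent": 0.5,
    "distant": 0.2,
    "opposite": 0.0,
}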


class WheelBenchmark:
    """
    Wheel-based emotion benchmark with similarity scoring

    Uses EmotionWheel to determine:
    - Exact matches (same emotion)
    - Same category (similar emotions)
    - Adjacent category (related emotions)
    - Opposite category (contradictory emotions - failures)
    """

    def __init__(self, analyzer, emoji_mapper):
        """
        Initialize benchmark

        Args:
            analyzer: Sentiment analyzer instance
            emoji_mapper: EmojiMapper instance
        """
        self.analyzer = analyzer
        self.mapper = emoji_mapper
        self.wheel = get_emotion_wheel()

    def _extract_emotion(self, result: Dict[str, Any]) -> str:
        """Extract emotion label from analyzer result"""
        # Try different possible keys
        for key in ["label", "emotion", "detected_emotion"]:
            if key in result:
                return result[key].lower()
        return "neutral"

    def run_single_test(self, text: str, expected_emotion: str, expected_category: str) -> WheelTestResult:
        """Run single test with wheel-based scoring"""
        start_time = time.perf_counter()
        result = self.analyzer.analyze(text)
        end_time = time.perf_counter()
        inference_time_ms = (end_time - start_time) * 1000

        detected_emotion = self._extract_emotion(result)
        detected_category = self.wheel.get_category(detected_emotion)
        detected_cat_name = detected_category.value if detected_category else "unknown"

        # Get similarity score and relationship
        score, relationship = self.wheel.get_similarity_score(expected_emotion, detected_emotion)

        # Get emoji
        emoji = self.mapper.get_emoji(detected_emotion)

        return WheelTestResult(
            emotion=expected_emotion,
            text=text,
            expected_category=expected_category,
            detected_emotion=detected_emotion,
            detected_category=detected_cat_name,
            similarity_score=score,
            relationship=relationship,
            inference_time_ms=inference_time_ms,
            emoji=emoji
        )

    def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> WheelBenchmarkResults:
        """
        Run full benchmark with wheel-based scoring

        Args:
            test_data: Dict mapping emotion -> [(text, category), ...]

        Returns:
            WheelBenchmarkResults with similarity-aware metrics
        """
        results = WheelBenchmarkResults()
        results.emotion_results = defaultdict(list)
        results.category_confusion = defaultdict(lambda: defaultdict(int))

        all_times = []
        all_scores = []

        for emotion, test_cases in test_data.items():
            emotion_scores = []

            for text, expected_category in test_cases:
                test_result = self.run_single_test(text, emotion, expected_category)
                results.emotion_results[emotion].append(test_result)
                results.total_tests += 1

                all_times.append(test_result.inference_time_ms)
                all_scores.append(test_result.similarity_score)
                emotion_scores.append(test_result.similarity_score)

                # Count by relationship type
                if test_result.relationship == "exact":
                    results.exact_matches += 1
                elif test_result.relationship == "same_category":
                    results.same_category_matches += 1
                elif test_result.relationship == "adjacent":
                    results.adjacent_matches += 1
                elif test_result.relationship == "opposite":
                    results.opposite_matches += 1
                else:
                    results.distant_matches += 1

                # Update category confusion matrix
                results.category_confusion[expected_category][test_result.detected_category] += 1

            # Calculate per-emotion score
            if emotion_scores:
                avg_score = sum(emotion_scores) / len(emotion_scores)
                results.emotion_scores[emotion] = avg_score

                # Check if emotion failed (majority opposite)
                opposite_count = sum(1 for r in results.emotion_results[emotion] if r.relationship == "opposite")
                if opposite_count > len(emotion_scores) / 2:
                    results.failed_emotions.append(emotion)

        # Calculate overall metrics
        if results.total_tests > 0:
            results.weighted_accuracy = sum(all_scores) / len(all_scores)
            results.exact_accuracy = results.exact_matches / results.total_tests
            acceptable = results.exact_matches + results.same_category_matches + results.adjacent_matches
            results.acceptable_accuracy = acceptable / results.total_tests
            results.contradiction_rate = results.opposite_matches / results.total_tests

        # Calculate category accuracy
        for category in set(tc[1] for cases in test_data.values() for tc in cases):
            category_tests = [
                r for emotion_results in results.emotion_results.values()
                for r in emotion_results
                if r.expected_category == category
            ]
            if category_tests:
                category_score = sum(r.similarity_score for r in category_tests) / len(category_tests)
                results.category_accuracy[category] = category_score

        # Calculate timing
        if all_times:
            results.avg_inference_time_ms = sum(all_times) / len(all_times)

        return results
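
    # Illustrative test_data shape for run_benchmark(); the texts and category
    # names here are hypothetical placeholders - the real data comes from
    # EmotionTestSuiteV3.EMOTION_TEST_DATA:
    #
    #     {
    #         "joy": [
    #             ("I am so happy today!", "happiness"),
    #             ("This is wonderful news.", "happiness"),
    #         ],
    #         "anger": [
    #             ("This makes me absolutely furious.", "anger"),
    #         ],
    #     }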

    def generate_detailed_report(self, results: WheelBenchmarkResults) -> str:
        """Generate detailed report showing detected vs expected"""
        lines = [
            "╔════════════════════════════════════════════════════════════════════════════════╗",
            "║ WHEEL-BASED EMOTION RECOGNITION EVALUATION REPORT ║",
            "╠════════════════════════════════════════════════════════════════════════════════╣",
            f"║ Total Tests: {results.total_tests:<10} ║",
            f"║ Weighted Accuracy: {results.weighted_accuracy:.1%} ║",
            f"║ Exact Match Rate: {results.exact_accuracy:.1%} ║",
            f"║ Acceptable Rate: {results.acceptable_accuracy:.1%} (exact + similar + adjacent) ║",
            f"║ Contradiction Rate: {results.contradiction_rate:.1%} (opposite emotions) ║",
            "╚════════════════════════════════════════════════════════════════════════════════╝",
            "",
            "┌────────────────────────────────────────────────────────────────────────────────┐",
            "│ SCORING BREAKDOWN │",
            "├────────────────────────────────────────────────────────────────────────────────┤",
            f"│ ✅ Exact Matches: {results.exact_matches:4} ({results.exact_matches/results.total_tests*100:5.1f}%) - Score: 1.0 │",
            f"│ 🟢 Same Category: {results.same_category_matches:4} ({results.same_category_matches/results.total_tests*100:5.1f}%) - Score: 0.8 │",
            f"│ 🟡 Adjacent Category: {results.adjacent_matches:4} ({results.adjacent_matches/results.total_tests*100:5.1f}%) - Score: 0.5 │",
            f"│ 🟠 Distant Category: {results.distant_matches:4} ({results.distant_matches/results.total_tests*100:5.1f}%) - Score: 0.2 │",
            f"│ ❌ Opposite (FAIL): {results.opposite_matches:4} ({results.opposite_matches/results.total_tests*100:5.1f}%) - Score: 0.0 │",
            "└────────────────────────────────────────────────────────────────────────────────┘",
            "",
        ]

        # Category performance
        lines.extend([
            "┌────────────────────────────────────────────────────────────────────────────────┐",
            "│ CATEGORY PERFORMANCE │",
            "├────────────────────────────────────────────────────────────────────────────────┤",
        ])
        for category, score in sorted(results.category_accuracy.items(), key=lambda x: -x[1]):
            bar = "█" * int(score * 30) + "░" * (30 - int(score * 30))
            lines.append(f"│ {category:<15} {bar} {score:.1%} │")
        lines.append("└────────────────────────────────────────────────────────────────────────────────┘")
        lines.append("")

        # Show some example results for each category
        lines.extend([
            "┌────────────────────────────────────────────────────────────────────────────────┐",
            "│ SAMPLE RESULTS (Expected → Detected) │",
            "├────────────────────────────────────────────────────────────────────────────────┤",
        ])

        # Show a few examples from each relationship type
        examples = {"exact": [], "same_category": [], "adjacent": [], "opposite": []}
        for emotion_results in results.emotion_results.values():
            for r in emotion_results:
                if r.relationship in examples and len(examples[r.relationship]) < 3:
                    examples[r.relationship].append(r)

        for rel_type, emoji_prefix in [("exact", "✅"), ("same_category", "🟢"), ("adjacent", "🟡"), ("opposite", "❌")]:
            if examples[rel_type]:
                lines.append(f"│ {emoji_prefix} {rel_type.upper()}: │")
                for ex in examples[rel_type]:
                    text_short = ex.text[:35] + "..." if len(ex.text) > 35 else ex.text
                    lines.append(f"│ '{text_short:<38}' │")
                    lines.append(f"│ Expected: {ex.emotion:<12} → Detected: {ex.detected_emotion:<12} {ex.emoji} │")
        lines.append("└────────────────────────────────────────────────────────────────────────────────┘")

        # Failed emotions
        if results.failed_emotions:
            lines.extend([
                "",
                "┌────────────────────────────────────────────────────────────────────────────────┐",
                "│ ⚠️ FAILED EMOTIONS (>50% contradictory detections) │",
                "├────────────────────────────────────────────────────────────────────────────────┤",
            ])
            for em in results.failed_emotions[:10]:
                score = results.emotion_scores.get(em, 0)
                lines.append(f"│ ❌ {em:<25} Score: {score:.2f} │")
            lines.append("└────────────────────────────────────────────────────────────────────────────────┘")

        return "\n".join(lines)

    def generate_emotion_detail_table(self, results: WheelBenchmarkResults) -> str:
        """Generate detailed table for all emotions"""
        lines = [
            "# Emotion Recognition Detail Report",
            "",
            "| Emotion | Score | Exact | Similar | Adjacent | Opposite | Status |",
            "|---------|-------|-------|---------|----------|----------|--------|",
        ]

        for emotion in sorted(results.emotion_scores.keys()):
            score = results.emotion_scores[emotion]
            emotion_tests = results.emotion_results[emotion]

            exact = sum(1 for r in emotion_tests if r.relationship == "exact")
            same = sum(1 for r in emotion_tests if r.relationship == "same_category")
            adj = sum(1 for r in emotion_tests if r.relationship == "adjacent")
            opp = sum(1 for r in emotion_tests if r.relationship == "opposite")
            total = len(emotion_tests)

            if score >= 0.8:
                status = "✅ PASS"
            elif score >= 0.5:
                status = "⚠️ OK"
            else:
                status = "❌ FAIL"

            lines.append(
                f"| {emotion:<15} | {score:.2f} | "
                f"{exact}/{total} | {same}/{total} | {adj}/{total} | {opp}/{total} | {status} |"
            )

        return "\n".join(lines)
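

def _example_usage() -> None:
    """Minimal usage sketch (illustrative only, not wired into the pipeline).

    Assumes just the interfaces WheelBenchmark relies on: an analyzer exposing
    analyze(text) that returns a dict with a "label"/"emotion" key, and a
    mapper exposing get_emoji(emotion). The stub classes and the sample data
    below are hypothetical stand-ins for the real components.
    """

    class _StubAnalyzer:
        def analyze(self, text: str) -> Dict[str, Any]:
            # Placeholder "model" that always predicts joy
            return {"label": "joy"}

    class _StubMapper:
        def get_emoji(self, emotion: str) -> str:
            return "🙂"

    benchmark = WheelBenchmark(_StubAnalyzer(), _StubMapper())
    sample_data = {"joy": [("I am so happy today!", "happiness")]}
    results = benchmark.run_benchmark(sample_data)
    print(benchmark.generate_detailed_report(results))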


def run_wheel_evaluation():
    """Run the wheel-based evaluation"""
    print("=" * 80)
    print("WHEEL-BASED EMOTION RECOGNITION EVALUATION")
    print("=" * 80)
    print()

    # Import components
    from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer
    from avatar.sentiment_emoji_map import EmojiMapper
    from evaluation.emotion_test_suite_v3 import EmotionTestSuiteV3

    # Initialize
    print("Loading components...")
    analyzer = MultiEmotionAnalyzer()
    mapper = EmojiMapper()
    suite = EmotionTestSuiteV3()
    benchmark = WheelBenchmark(analyzer, mapper)

    print(f"Test Suite V{suite.VERSION}")
    print(f"Emotions: {suite.get_emotion_count()}, Tests: {suite.get_test_count()}")
    print()

    # Run benchmark
    print("Running benchmark...")
    start = time.time()
    results = benchmark.run_benchmark(suite.EMOTION_TEST_DATA)
    elapsed = time.time() - start
    print(f"Completed in {elapsed:.2f}s")
    print()

    # Show report
    print(benchmark.generate_detailed_report(results))

    # Save detailed markdown report
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_dir = "evaluation/reports"
    os.makedirs(report_dir, exist_ok=True)

    # Save emotion detail table
    detail_path = os.path.join(report_dir, f"wheel_evaluation_{timestamp}.md")
    with open(detail_path, "w", encoding="utf-8") as f:
        f.write(f"# Wheel-Based Emotion Evaluation Report\n\n")
        f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(f"## Summary\n\n")
        f.write(f"- **Total Tests:** {results.total_tests}\n")
        f.write(f"- **Weighted Accuracy:** {results.weighted_accuracy:.1%}\n")
        f.write(f"- **Exact Match Rate:** {results.exact_accuracy:.1%}\n")
        f.write(f"- **Acceptable Rate:** {results.acceptable_accuracy:.1%}\n")
        f.write(f"- **Contradiction Rate:** {results.contradiction_rate:.1%}\n\n")
        f.write(f"## Scoring Breakdown\n\n")
        f.write(f"| Relationship | Count | Percentage | Score |\n")
        f.write(f"|--------------|-------|------------|-------|\n")
        f.write(f"| ✅ Exact | {results.exact_matches} | {results.exact_matches/results.total_tests*100:.1f}% | 1.0 |\n")
        f.write(f"| 🟢 Same Category | {results.same_category_matches} | {results.same_category_matches/results.total_tests*100:.1f}% | 0.8 |\n")
        f.write(f"| 🟡 Adjacent | {results.adjacent_matches} | {results.adjacent_matches/results.total_tests*100:.1f}% | 0.5 |\n")
        f.write(f"| 🟠 Distant | {results.distant_matches} | {results.distant_matches/results.total_tests*100:.1f}% | 0.2 |\n")
        f.write(f"| ❌ Opposite | {results.opposite_matches} | {results.opposite_matches/results.total_tests*100:.1f}% | 0.0 |\n\n")
        f.write(benchmark.generate_emotion_detail_table(results))

    print(f"\n📄 Detailed report saved to: {detail_path}")

    return results


if __name__ == "__main__":
    run_wheel_evaluation()