# -*- coding: utf-8 -*-
"""
Wheel-Based Accuracy Benchmark - Similarity-aware emotion evaluation

Features:
- Uses Emotion Wheel for similarity scoring
- Exact match = 1.0, Same category = 0.8, Adjacent = 0.5, Opposite = 0.0
- Shows detected emotion for each test
- Calculates weighted accuracy based on similarity
- Identifies contradictory detections vs similar mistakes

Version: 3.0.0
"""

import sys
import os
import time
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass, field
from collections import defaultdict
from datetime import datetime

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from avatar.emotion_wheel import EmotionWheel, get_emotion_wheel, EmotionCategory


@dataclass
class WheelTestResult:
    """Result for a single test with wheel-based scoring"""
    emotion: str
    text: str
    expected_category: str
    detected_emotion: str
    detected_category: Optional[str]
    similarity_score: float
    relationship: str  # "exact", "same_category", "adjacent", "distant", "opposite"
    inference_time_ms: float
    emoji: str


@dataclass
class WheelBenchmarkResults:
    """Aggregated wheel-based benchmark results"""
    total_tests: int = 0

    # Weighted accuracy (using similarity scores)
    weighted_accuracy: float = 0.0

    # Traditional accuracy
    exact_matches: int = 0
    exact_accuracy: float = 0.0

    # Similarity-based metrics
    same_category_matches: int = 0
    adjacent_matches: int = 0
    distant_matches: int = 0
    opposite_matches: int = 0  # These are the real failures

    # Acceptable = exact + same_category + adjacent
    acceptable_accuracy: float = 0.0

    # Contradiction rate (opposite emotions)
    contradiction_rate: float = 0.0

    # Per-emotion results
    emotion_results: Dict[str, List[WheelTestResult]] = field(default_factory=dict)
    emotion_scores: Dict[str, float] = field(default_factory=dict)

    # Category performance
    category_accuracy: Dict[str, float] = field(default_factory=dict)

    # Timing
    avg_inference_time_ms: float = 0.0

    # Failed emotions (>50% opposite)
    failed_emotions: List[str] = field(default_factory=list)

    # Confusion matrix by category
    category_confusion: Dict[str, Dict[str, int]] = field(default_factory=dict)
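

# Reference only: the similarity score this benchmark expects
# EmotionWheel.get_similarity_score() to assign to each relationship type,
# as listed in the module docstring and the report tables below. The values
# used for scoring always come from the wheel itself; this mapping is a
# documentation aid (an assumption about the wheel's configuration), not
# something the code reads.
RELATIONSHIP_SCORES = {
    "exact": 1.0,
    "same_category": 0.8,
    "adjacent": 0.5,
    "distant": 0.2,
    "opposite": 0.0,
}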


class WheelBenchmark:
    """
    Wheel-based emotion benchmark with similarity scoring

    Uses EmotionWheel to determine:
    - Exact matches (same emotion)
    - Same category (similar emotions)
    - Adjacent category (related emotions)
    - Opposite category (contradictory emotions - failures)
    """

    def __init__(self, analyzer, emoji_mapper):
        """
        Initialize benchmark

        Args:
            analyzer: Sentiment analyzer instance
            emoji_mapper: EmojiMapper instance
        """
        self.analyzer = analyzer
        self.mapper = emoji_mapper
        self.wheel = get_emotion_wheel()

    def _extract_emotion(self, result: Dict[str, Any]) -> str:
        """Extract emotion label from analyzer result"""
        # Try different possible keys
        for key in ["label", "emotion", "detected_emotion"]:
            if key in result:
                return result[key].lower()
        return "neutral"

    def run_single_test(self, text: str, expected_emotion: str, expected_category: str) -> WheelTestResult:
        """Run single test with wheel-based scoring"""
        start_time = time.perf_counter()
        result = self.analyzer.analyze(text)
        end_time = time.perf_counter()
        inference_time_ms = (end_time - start_time) * 1000

        detected_emotion = self._extract_emotion(result)
        detected_category = self.wheel.get_category(detected_emotion)
        detected_cat_name = detected_category.value if detected_category else "unknown"

        # Get similarity score and relationship
        score, relationship = self.wheel.get_similarity_score(expected_emotion, detected_emotion)

        # Get emoji
        emoji = self.mapper.get_emoji(detected_emotion)

        return WheelTestResult(
            emotion=expected_emotion,
            text=text,
            expected_category=expected_category,
            detected_emotion=detected_emotion,
            detected_category=detected_cat_name,
            similarity_score=score,
            relationship=relationship,
            inference_time_ms=inference_time_ms,
            emoji=emoji
        )

    def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> WheelBenchmarkResults:
        """
        Run full benchmark with wheel-based scoring

        Args:
            test_data: Dict mapping emotion -> [(text, category), ...]

        Returns:
            WheelBenchmarkResults with similarity-aware metrics
        """
        results = WheelBenchmarkResults()
        results.emotion_results = defaultdict(list)
        results.category_confusion = defaultdict(lambda: defaultdict(int))

        all_times = []
        all_scores = []

        for emotion, test_cases in test_data.items():
            emotion_scores = []

            for text, expected_category in test_cases:
                test_result = self.run_single_test(text, emotion, expected_category)
                results.emotion_results[emotion].append(test_result)
                results.total_tests += 1

                all_times.append(test_result.inference_time_ms)
                all_scores.append(test_result.similarity_score)
                emotion_scores.append(test_result.similarity_score)

                # Count by relationship type
                if test_result.relationship == "exact":
                    results.exact_matches += 1
                elif test_result.relationship == "same_category":
                    results.same_category_matches += 1
                elif test_result.relationship == "adjacent":
                    results.adjacent_matches += 1
                elif test_result.relationship == "opposite":
                    results.opposite_matches += 1
                else:
                    results.distant_matches += 1

                # Update category confusion matrix
                results.category_confusion[expected_category][test_result.detected_category] += 1

            # Calculate per-emotion score
            if emotion_scores:
                avg_score = sum(emotion_scores) / len(emotion_scores)
                results.emotion_scores[emotion] = avg_score

                # Check if emotion failed (majority opposite)
                opposite_count = sum(1 for r in results.emotion_results[emotion] if r.relationship == "opposite")
                if opposite_count > len(emotion_scores) / 2:
                    results.failed_emotions.append(emotion)

        # Calculate overall metrics
        if results.total_tests > 0:
            results.weighted_accuracy = sum(all_scores) / len(all_scores)
            results.exact_accuracy = results.exact_matches / results.total_tests
            acceptable = results.exact_matches + results.same_category_matches + results.adjacent_matches
            results.acceptable_accuracy = acceptable / results.total_tests
            results.contradiction_rate = results.opposite_matches / results.total_tests

        # Calculate category accuracy
        for category in set(tc[1] for cases in test_data.values() for tc in cases):
            category_tests = [
                r for emotion_results in results.emotion_results.values()
                for r in emotion_results
                if r.expected_category == category
            ]
            if category_tests:
                category_score = sum(r.similarity_score for r in category_tests) / len(category_tests)
                results.category_accuracy[category] = category_score

        # Calculate timing
        if all_times:
            results.avg_inference_time_ms = sum(all_times) / len(all_times)

        return results
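
    # Illustrative test_data shape for run_benchmark(); the texts and category
    # names here are hypothetical placeholders - the real data comes from
    # EmotionTestSuiteV3.EMOTION_TEST_DATA:
    #
    #     {
    #         "joy": [
    #             ("I am so happy today!", "happiness"),
    #             ("This is wonderful news.", "happiness"),
    #         ],
    #         "anger": [
    #             ("This makes me absolutely furious.", "anger"),
    #         ],
    #     }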

    def generate_detailed_report(self, results: WheelBenchmarkResults) -> str:
        """Generate detailed report showing detected vs expected"""
        lines = [
            "╔════════════════════════════════════════════════════════════════════════════════╗",
            "║ WHEEL-BASED EMOTION RECOGNITION EVALUATION REPORT ║",
            "╠════════════════════════════════════════════════════════════════════════════════╣",
            f"║ Total Tests: {results.total_tests:<10} ║",
            f"║ Weighted Accuracy: {results.weighted_accuracy:.1%} ║",
            f"║ Exact Match Rate: {results.exact_accuracy:.1%} ║",
            f"║ Acceptable Rate: {results.acceptable_accuracy:.1%} (exact + similar + adjacent) ║",
            f"║ Contradiction Rate: {results.contradiction_rate:.1%} (opposite emotions) ║",
            "╚════════════════════════════════════════════════════════════════════════════════╝",
            "",
            "┌────────────────────────────────────────────────────────────────────────────────┐",
            "│ SCORING BREAKDOWN │",
            "├────────────────────────────────────────────────────────────────────────────────┤",
            f"│ ✅ Exact Matches: {results.exact_matches:4} ({results.exact_matches/results.total_tests*100:5.1f}%) - Score: 1.0 │",
            f"│ 🟢 Same Category: {results.same_category_matches:4} ({results.same_category_matches/results.total_tests*100:5.1f}%) - Score: 0.8 │",
            f"│ 🟡 Adjacent Category: {results.adjacent_matches:4} ({results.adjacent_matches/results.total_tests*100:5.1f}%) - Score: 0.5 │",
            f"│ 🟠 Distant Category: {results.distant_matches:4} ({results.distant_matches/results.total_tests*100:5.1f}%) - Score: 0.2 │",
            f"│ ❌ Opposite (FAIL): {results.opposite_matches:4} ({results.opposite_matches/results.total_tests*100:5.1f}%) - Score: 0.0 │",
            "└────────────────────────────────────────────────────────────────────────────────┘",
            "",
        ]

        # Category performance
        lines.extend([
            "┌────────────────────────────────────────────────────────────────────────────────┐",
            "│ CATEGORY PERFORMANCE │",
            "├────────────────────────────────────────────────────────────────────────────────┤",
        ])
        for category, score in sorted(results.category_accuracy.items(), key=lambda x: -x[1]):
            bar = "█" * int(score * 30) + "░" * (30 - int(score * 30))
            lines.append(f"│ {category:<15} {bar} {score:.1%} │")
        lines.append("└────────────────────────────────────────────────────────────────────────────────┘")
        lines.append("")

        # Show some example results for each category
        lines.extend([
            "┌────────────────────────────────────────────────────────────────────────────────┐",
            "│ SAMPLE RESULTS (Expected → Detected) │",
            "├────────────────────────────────────────────────────────────────────────────────┤",
        ])

        # Show a few examples from each relationship type
        examples = {"exact": [], "same_category": [], "adjacent": [], "opposite": []}
        for emotion_results in results.emotion_results.values():
            for r in emotion_results:
                if r.relationship in examples and len(examples[r.relationship]) < 3:
                    examples[r.relationship].append(r)

        for rel_type, emoji_prefix in [("exact", "✅"), ("same_category", "🟢"), ("adjacent", "🟡"), ("opposite", "❌")]:
            if examples[rel_type]:
                lines.append(f"│ {emoji_prefix} {rel_type.upper()}: │")
                for ex in examples[rel_type]:
                    text_short = ex.text[:35] + "..." if len(ex.text) > 35 else ex.text
                    lines.append(f"│ '{text_short:<38}' │")
                    lines.append(f"│ Expected: {ex.emotion:<12} → Detected: {ex.detected_emotion:<12} {ex.emoji} │")
        lines.append("└────────────────────────────────────────────────────────────────────────────────┘")

        # Failed emotions
        if results.failed_emotions:
            lines.extend([
                "",
                "┌────────────────────────────────────────────────────────────────────────────────┐",
                "│ ⚠️ FAILED EMOTIONS (>50% contradictory detections) │",
                "├────────────────────────────────────────────────────────────────────────────────┤",
            ])
            for em in results.failed_emotions[:10]:
                score = results.emotion_scores.get(em, 0)
                lines.append(f"│ ❌ {em:<25} Score: {score:.2f} │")
            lines.append("└────────────────────────────────────────────────────────────────────────────────┘")

        return "\n".join(lines)

    def generate_emotion_detail_table(self, results: WheelBenchmarkResults) -> str:
        """Generate detailed table for all emotions"""
        lines = [
            "# Emotion Recognition Detail Report",
            "",
            "| Emotion | Score | Exact | Similar | Adjacent | Opposite | Status |",
            "|---------|-------|-------|---------|----------|----------|--------|",
        ]

        for emotion in sorted(results.emotion_scores.keys()):
            score = results.emotion_scores[emotion]
            emotion_tests = results.emotion_results[emotion]

            exact = sum(1 for r in emotion_tests if r.relationship == "exact")
            same = sum(1 for r in emotion_tests if r.relationship == "same_category")
            adj = sum(1 for r in emotion_tests if r.relationship == "adjacent")
            opp = sum(1 for r in emotion_tests if r.relationship == "opposite")
            total = len(emotion_tests)

            if score >= 0.8:
                status = "✅ PASS"
            elif score >= 0.5:
                status = "⚠️ OK"
            else:
                status = "❌ FAIL"

            lines.append(
                f"| {emotion:<15} | {score:.2f} | "
                f"{exact}/{total} | {same}/{total} | {adj}/{total} | {opp}/{total} | {status} |"
            )

        return "\n".join(lines)
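

def _example_usage() -> None:
    """Minimal usage sketch (illustrative only, not wired into the pipeline).

    Assumes just the interfaces WheelBenchmark relies on: an analyzer exposing
    analyze(text) that returns a dict with a "label"/"emotion" key, and a
    mapper exposing get_emoji(emotion). The stub classes and the sample data
    below are hypothetical stand-ins for the real components.
    """

    class _StubAnalyzer:
        def analyze(self, text: str) -> Dict[str, Any]:
            # Placeholder "model" that always predicts joy
            return {"label": "joy"}

    class _StubMapper:
        def get_emoji(self, emotion: str) -> str:
            return "🙂"

    benchmark = WheelBenchmark(_StubAnalyzer(), _StubMapper())
    sample_data = {"joy": [("I am so happy today!", "happiness")]}
    results = benchmark.run_benchmark(sample_data)
    print(benchmark.generate_detailed_report(results))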


def run_wheel_evaluation():
    """Run the wheel-based evaluation"""
    print("=" * 80)
    print("WHEEL-BASED EMOTION RECOGNITION EVALUATION")
    print("=" * 80)
    print()

    # Import components
    from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer
    from avatar.sentiment_emoji_map import EmojiMapper
    from evaluation.emotion_test_suite_v3 import EmotionTestSuiteV3

    # Initialize
    print("Loading components...")
    analyzer = MultiEmotionAnalyzer()
    mapper = EmojiMapper()
    suite = EmotionTestSuiteV3()
    benchmark = WheelBenchmark(analyzer, mapper)

    print(f"Test Suite V{suite.VERSION}")
    print(f"Emotions: {suite.get_emotion_count()}, Tests: {suite.get_test_count()}")
    print()

    # Run benchmark
    print("Running benchmark...")
    start = time.time()
    results = benchmark.run_benchmark(suite.EMOTION_TEST_DATA)
    elapsed = time.time() - start
    print(f"Completed in {elapsed:.2f}s")
    print()

    # Show report
    print(benchmark.generate_detailed_report(results))

    # Save detailed markdown report
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_dir = "evaluation/reports"
    os.makedirs(report_dir, exist_ok=True)

    # Save emotion detail table
    detail_path = os.path.join(report_dir, f"wheel_evaluation_{timestamp}.md")
    with open(detail_path, "w", encoding="utf-8") as f:
        f.write(f"# Wheel-Based Emotion Evaluation Report\n\n")
        f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(f"## Summary\n\n")
        f.write(f"- **Total Tests:** {results.total_tests}\n")
        f.write(f"- **Weighted Accuracy:** {results.weighted_accuracy:.1%}\n")
        f.write(f"- **Exact Match Rate:** {results.exact_accuracy:.1%}\n")
        f.write(f"- **Acceptable Rate:** {results.acceptable_accuracy:.1%}\n")
        f.write(f"- **Contradiction Rate:** {results.contradiction_rate:.1%}\n\n")
        f.write(f"## Scoring Breakdown\n\n")
        f.write(f"| Relationship | Count | Percentage | Score |\n")
        f.write(f"|--------------|-------|------------|-------|\n")
        f.write(f"| ✅ Exact | {results.exact_matches} | {results.exact_matches/results.total_tests*100:.1f}% | 1.0 |\n")
        f.write(f"| 🟢 Same Category | {results.same_category_matches} | {results.same_category_matches/results.total_tests*100:.1f}% | 0.8 |\n")
        f.write(f"| 🟡 Adjacent | {results.adjacent_matches} | {results.adjacent_matches/results.total_tests*100:.1f}% | 0.5 |\n")
        f.write(f"| 🟠 Distant | {results.distant_matches} | {results.distant_matches/results.total_tests*100:.1f}% | 0.2 |\n")
        f.write(f"| ❌ Opposite | {results.opposite_matches} | {results.opposite_matches/results.total_tests*100:.1f}% | 0.0 |\n\n")
        f.write(benchmark.generate_emotion_detail_table(results))

    print(f"\n📄 Detailed report saved to: {detail_path}")

    return results


if __name__ == "__main__":
    run_wheel_evaluation()