import copy as cp
import json
from collections import defaultdict
from decimal import Decimal, ROUND_HALF_UP
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS


def listinstr(lst, s):
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False
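
# Illustrative usage of listinstr (values are made up):
#   listinstr(['gsm8k', 'AQuA'], 'gsm8k-Score')  -> True   (a list item occurs in the string)
#   listinstr(['MATH'], 'gsm8k-Score')           -> False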


def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    # Use a context manager so the file handle is closed promptly
    with open(file_name, "r") as f:
        data = json.load(f)
    return data


def format_timestamp(timestamp):
    date = timestamp[:10]
    time = timestamp[11:13] + ':' + timestamp[14:16] + ':' + timestamp[17:19]
    return date + ' ' + time
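
# Illustrative usage, assuming an ISO-8601-style input such as 'YYYY-MM-DDTHH:MM:SS'
# (the slice positions above only make sense for that layout):
#   format_timestamp('2024-05-01T13:07:42')  -> '2024-05-01 13:07:42'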


def nth_large(val, vals):
    return sum([1 for v in vals if v > val]) + 1
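
# Illustrative usage: the rank is 1 plus the count of strictly larger values,
# so ties share the same rank.
#   nth_large(85, [92, 85, 78])  -> 2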


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
    # First check which columns exist in the actual data structure
    sample_data = next(iter(results.values()))
    available_fields = []
    for field in fields:
        if field in sample_data:
            available_fields.append(field)
    # Build column names; ensure they match exactly with those in generate_table
    score_columns = [f"{field}-Score" for field in available_fields]
    cost_columns = [f"{field}-Cost($)" for field in available_fields]
    combined_columns = score_columns + cost_columns
    combined_columns_sorted = sorted(combined_columns, key=lambda x: x.split('-')[0])
    check_box['required'] = ['Avg Score'] + combined_columns_sorted
    check_box['all'] = ['Avg Score'] + combined_columns_sorted
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Avg Score'] = 'number'
    type_map['gsm8k-Score'] = 'number'
    type_map['AQuA-Score'] = 'number'
    type_map['gsm8k-Cost($)'] = 'number'
    type_map['AQuA-Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return check_box
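
# Illustrative sketch of the expected `results` shape (keys below are assumptions
# inferred from generate_table, which consumes the same structure; they are not
# taken from the real OVERALL_MATH_SCORE_FILE):
#
#   results = {
#       'CoT (gpt-4o-mini)': {
#           'META': {'Algorithm': 'CoT', 'LLM': 'gpt-4o-mini', 'Eval Date': '2024-05-01'},
#           'gsm8k': {'Score': 93.2, 'Cost($)': 0.41},
#           'AQuA': {'Score': 79.5, 'Cost($)': 0.12},
#       },
#   }
#   check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)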


def BUILD_L2_DF(results, fields):
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Get dataset data
                dataset_data = model_data[dataset]
                # Add all fields
                for field, value in dataset_data.items():
                    res[field].append(value)
    # Create DataFrame
    df = pd.DataFrame(res)
    # Get all unique Algorithms and LLMs
    unique_algorithms = df['Algorithm'].unique().tolist()
    unique_llms = df['LLM'].unique().tolist()
    # Set checkbox configuration
    check_box = {}
    check_box['Algorithm_options'] = unique_algorithms  # Algorithm filter options
    check_box['LLM_options'] = unique_llms  # LLM filter options
    # Sort by Dataset (ascending) and Score (descending)
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    # Add rank for each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Rearrange column order
    columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]
    # Set checkbox configuration
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'X-shot', 'Eval Date']
    check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'Samples', 'All tokens', 'Cost($)']
    check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Samples', 'Total input tokens', 'Average input tokens',
                        'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['All tokens'] = 'number'
    type_map['Cost($)'] = 'number'
    check_box['type_map'] = type_map
    return df, check_box
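
# Illustrative sketch of the nested detail structure this function expects
# (algorithm -> model -> META / per-dataset records). Field names other than
# 'META' and 'Score' are assumptions for illustration only:
#
#   detail_results = {
#       'CoT': {
#           'gpt-4o-mini': {
#               'META': {'Algorithm': 'CoT', 'LLM': 'gpt-4o-mini',
#                        'Eval Date': '2024-05-01', 'X-shot': 8},
#               'gsm8k': {'Score': 93.2, 'Pass rate': 0.93, 'Cost($)': 0.41},
#           },
#       },
#   }
#   df, check_box = BUILD_L2_DF(detail_results, DEFAULT_MATH_BENCH)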


def generate_table(results, fields):
    res = defaultdict(list)
    for i, m in enumerate(results):
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            res[k].append(meta[k])
        scores, costs = [], []
        # Ensure column names format matches with BUILD_L1_DF
        for d in fields:
            if d in item:
                score = item[d].get("Score")
                cost = item[d].get("Cost($)")
                res[f"{d}-Score"].append(score)
                res[f"{d}-Cost($)"].append(cost)
                if score is not None:
                    scores.append(score)
                if cost is not None:
                    costs.append(cost)
            else:
                res[f"{d}-Score"].append(None)
                res[f"{d}-Cost($)"].append(None)
        # Calculate the average score with Decimal to avoid float rounding artifacts
        if scores:
            decimal_numbers = [Decimal(str(num)) for num in scores]
            avg_score = sum(decimal_numbers) / len(decimal_numbers)
            formatted_average = avg_score.quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)
        else:
            formatted_average = None
        res['Avg Score'].append(formatted_average)
    df = pd.DataFrame(res)
    # Sort by Avg Score and assign ranks; rows without a score go last
    valid = df[~pd.isna(df['Avg Score'])].copy()
    missing = df[pd.isna(df['Avg Score'])].copy()
    valid = valid.sort_values('Avg Score', ascending=False)
    valid['Rank'] = range(1, len(valid) + 1)
    if not missing.empty:
        missing['Rank'] = len(valid) + 1
        df = pd.concat([valid, missing])
    else:
        df = valid
    df = df.sort_values('Rank')
    # Rearrange column order
    columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    for d in fields:
        columns.extend([f"{d}-Score", f"{d}-Cost($)"])
    existing_columns = [col for col in columns if col in df.columns]
    df = df[existing_columns]
    return df
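
# Illustrative pairing with BUILD_L1_DF (assumed flow; it presumes load_results
# returns the same top-level dict both functions consume, and that
# DEFAULT_MATH_BENCH is the list of dataset names to show):
#
#   results = load_results()
#   check_box = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
#   leaderboard_df = generate_table(results, DEFAULT_MATH_BENCH)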


def generate_table_detail(results, fields):
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Get dataset data
                dataset_data = model_data[dataset]
                # Add all fields
                for field, value in dataset_data.items():
                    res[field].append(value)
    # Create DataFrame
    df = pd.DataFrame(res)
    # Sort by Dataset (ascending) and Score (descending)
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    # Add rank for each dataset separately
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Rearrange column order
    columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot']
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[columns + remaining_columns]
    return df
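
# Note: generate_table_detail mirrors the DataFrame construction in BUILD_L2_DF
# but returns only the table, without the checkbox configuration.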


def BUILD_L3_DF(results, fields):
    res = defaultdict(list)
    # Iterate over each entry in the multi-modal results (results is a list)
    for entry in results:
        # Add Agent and VLMs
        res['Agent'].append(entry.get('Agent', 'Unknown'))
        res['VLMs'].append(entry.get('VLMs', 'Unknown'))
        # Add numeric fields
        for field in fields:
            res[field].append(entry.get(field, None))
    # Create DataFrame
    df = pd.DataFrame(res)
    # Remove duplicate rows based on 'Agent' and 'VLMs'
    df = df.drop_duplicates(subset=['Agent', 'VLMs'], keep='first')
    # Normalize whitespace in 'Agent' and 'VLMs'
    df['Agent'] = df['Agent'].str.strip()
    df['VLMs'] = df['VLMs'].str.strip()
    # Sort by Score in descending order
    df = df.sort_values('Score', ascending=False)
    # Add Rank column
    df['Rank'] = range(1, len(df) + 1)
    # Rearrange column order
    columns = ['Rank', 'Agent', 'VLMs'] + fields
    df = df[columns]
    # Set checkbox configuration
    check_box = {}
    check_box['essential'] = ['Agent', 'VLMs']
    check_box['required'] = check_box['essential'] + fields
    check_box['all'] = ['Rank'] + fields
    type_map = defaultdict(lambda: 'number')
    type_map['Agent'] = 'str'
    type_map['VLMs'] = 'str'
    type_map['Rank'] = 'number'
    for field in fields:
        type_map[field] = 'number'
    check_box['type_map'] = type_map
    return df, check_box
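
# Illustrative sketch of the multi-modal input this function expects: a list of
# flat dicts. Only 'Agent', 'VLMs' and 'Score' are read by name above; the other
# field names here are assumptions:
#
#   mm_results = [
#       {'Agent': 'ReAct', 'VLMs': 'GPT-4V', 'Score': 71.3, 'Cost($)': 1.20},
#       {'Agent': 'Direct', 'VLMs': 'GPT-4V', 'Score': 65.8, 'Cost($)': 0.95},
#   ]
#   df, check_box = BUILD_L3_DF(mm_results, ['Score', 'Cost($)'])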