import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS

def listinstr(lst, s):
    """Return True if any item in `lst` occurs as a substring of `s`."""
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False
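
# Illustrative behavior of listinstr (hypothetical strings, not app data):
#   listinstr(['MATH', 'GSM'], 'MATH-Cost($)')  -> True   ('MATH' is a substring)
#   listinstr(['Rank'], 'Avg Score')            -> False  (no item matches)
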
def load_results(file_name=OVERALL_MATH_SCORE_FILE):
    # Use a context manager so the file handle is closed after reading.
    with open(file_name, 'r') as f:
        return json.load(f)
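
# The loaded JSON is consumed below as a nested mapping. Its inferred shape
# (a sketch based on the accessors in this file, not a guaranteed schema):
#   {model: {'META': {...}, '<dataset>': {'Score': ..., 'Cost($)': ...}}}
# for generate_table, with one extra {algorithm: ...} level on top for the
# detail tables built by BUILD_L2_DF / generate_table_detail.
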
def format_timestamp(timestamp):
    # Expects an ISO-8601-like string, e.g. 'YYYY-MM-DDTHH:MM:SS'.
    date = timestamp[:10]
    time = timestamp[11:13] + ':' + timestamp[14:16] + ':' + timestamp[17:19]
    return date + ' ' + time
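
# Example (hypothetical input): format_timestamp('2024-05-01T12:34:56')
# returns '2024-05-01 12:34:56'; the slices assume an ISO-8601-like layout.
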
def nth_large(val, vals):
    """Return the 1-based rank of `val` among `vals` (1 = largest)."""
    return sum(1 for v in vals if v > val) + 1
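
# Example (hypothetical scores): nth_large(85.0, [90.1, 85.0, 72.3]) -> 2,
# since exactly one value is strictly greater than 85.0.
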
def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Algorithm', 'LLM', 'Eval Date']
    # Revise here to change which dataset columns are shown by default.
    check_box['required'] = ['Avg Score'] + [item for f in fields for item in (f'{f}-Score', f'{f}-Cost($)')]
    check_box['avg'] = ['Avg Score']
    check_box['all'] = check_box['avg'] + [item for f in fields for item in (f'{f}-Score', f'{f}-Cost($)')]
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    check_box['type_map'] = type_map
    # The matching DataFrame is built separately via generate_table(results, fields).
    return check_box
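
# Sketch of the returned structure for fields = ['GSM8K'] (hypothetical name):
#   {'essential': ['Algorithm', 'LLM', 'Eval Date'],
#    'required':  ['Avg Score', 'GSM8K-Score', 'GSM8K-Cost($)'],
#    'avg':       ['Avg Score'],
#    'all':       ['Avg Score', 'GSM8K-Score', 'GSM8K-Cost($)'],
#    'type_map':  defaultdict(<'number'>, {'Algorithm': 'html', ...})}
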
def BUILD_L2_DF(results, fields):
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Add every field reported for this dataset
                for field, value in model_data[dataset].items():
                    res[field].append(value)
    df = pd.DataFrame(res)
    # Sort by Dataset (ascending) and Score (descending), then rank within each dataset
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Move the fixed columns to the front; keep only those actually present
    columns = ['Rank', 'Algorithm', 'Dataset', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']
    existing_columns = [col for col in columns if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[existing_columns + remaining_columns]
    # Set checkbox configuration
    check_box = {}
    check_box['essential'] = ['Algorithm', 'Dataset', 'LLM', 'Eval Date']
    check_box['required'] = check_box['essential'] + ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'All tokens', 'Cost($)']
    check_box['all'] = ['Score', 'Pass rate', 'X-shot', 'Parameters', 'Samples', 'Total input tokens', 'Average input tokens', 'Total output tokens', 'Average output tokens', 'All tokens', 'Cost($)']
    type_map = defaultdict(lambda: 'number')
    type_map['Algorithm'] = 'html'
    type_map['LLM'] = type_map['Vision Model'] = 'html'
    type_map['Eval Date'] = 'str'
    type_map['Dataset'] = 'str'
    type_map['Parameters'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box
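
# Typical wiring (a sketch, not the app's exact control flow):
#   detail = load_results('detail_scores.json')  # hypothetical file path
#   df, check_box = BUILD_L2_DF(detail, DEFAULT_MATH_BENCH)
# where `df` backs a gradio DataFrame and `check_box` drives column selection.
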
def generate_table(results, fields):
    res = defaultdict(list)
    for m in results:
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            res[k].append(meta[k])
        scores, costs = [], []
        for d in fields:
            if d in item:
                res[d + '-Score'].append(item[d]['Score'])
                res[d + '-Cost($)'].append(item[d]['Cost($)'])
                scores.append(item[d]['Score'])
                costs.append(item[d]['Cost($)'])
            else:
                res[d + '-Score'].append(None)
                res[d + '-Cost($)'].append(None)
                scores.append(None)
                costs.append(None)
        # The average is only defined when a model has a score on every dataset
        res['Avg Score'].append(round(np.mean(scores), 2) if None not in scores else None)
    df = pd.DataFrame(res)
    # Sort by Avg Score in descending order; rows without an average sort last
    df = df.sort_values('Avg Score', ascending=False, na_position='last')
    df['Rank'] = range(1, len(df) + 1)
    # Fixed columns first, then per-dataset Score/Cost pairs, then anything else
    columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
    for d in fields:
        columns.extend([f'{d}-Score', f'{d}-Cost($)'])
    existing_columns = [col for col in columns if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[existing_columns + remaining_columns]
    return df
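
# Worked example with synthetic data (hypothetical names), assuming META_FIELDS
# covers 'Algorithm', 'LLM', and 'Eval Date':
#   results = {'CoT (gpt-4o)': {
#       'META': {'Algorithm': 'CoT', 'LLM': 'gpt-4o', 'Eval Date': '2024/05/01'},
#       'GSM8K': {'Score': 92.1, 'Cost($)': 0.31}}}
#   generate_table(results, ['GSM8K'])
# yields one row: Rank 1, Avg Score 92.1, GSM8K-Score 92.1, GSM8K-Cost($) 0.31.
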
def generate_table_detail(results, fields):
    # Builds the same long-format table as BUILD_L2_DF (only the column order
    # differs), without the checkbox configuration.
    res = defaultdict(list)
    # Iterate over each algorithm and its corresponding models
    for algo_name, algo_data in results.items():
        for model_name, model_data in algo_data.items():
            # Get META information
            meta = model_data['META']
            # Create a record for each dataset
            for dataset in fields:
                if dataset not in model_data:
                    continue
                # Add metadata
                for k, v in meta.items():
                    res[k].append(v)
                # Add dataset name
                res['Dataset'].append(dataset)
                # Add every field reported for this dataset
                for field, value in model_data[dataset].items():
                    res[field].append(value)
    df = pd.DataFrame(res)
    # Sort by Dataset (ascending) and Score (descending), then rank within each dataset
    df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
    df['Rank'] = df.groupby('Dataset').cumcount() + 1
    # Move the fixed columns to the front; keep only those actually present
    columns = ['Rank', 'Dataset', 'Algorithm', 'LLM', 'Eval Date', 'Score', 'Pass rate', 'X-shot', 'Parameters']
    existing_columns = [col for col in columns if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in columns]
    df = df[existing_columns + remaining_columns]
    return df
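

if __name__ == '__main__':
    # Smoke test with synthetic data (hypothetical names; the real app loads
    # its JSON score files via load_results instead).
    demo = {'CoT': {'gpt-4o': {
        'META': {'Algorithm': 'CoT', 'LLM': 'gpt-4o', 'Eval Date': '2024/05/01'},
        'GSM8K': {'Score': 92.1, 'Cost($)': 0.31, 'Pass rate': 1.0,
                  'X-shot': '8', 'Parameters': '-'}}}}
    print(generate_table_detail(demo, ['GSM8K']))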