translation set updated
Browse files
app.py
CHANGED
|
@@ -15,7 +15,13 @@ Evaluating the chat, safety, reasoning, and translation capabilities of Multilin
|
|
| 15 |
|
| 16 |
π https://m-rewardbench.github.io/'''
|
| 17 |
|
| 18 |
-
GOOGLE_SHEET_URL = "https://docs.google.com/spreadsheets/d/1qrD7plUdrBwAw7G6UeDVZAaV9ihxaNAcoiKwSaqotR4/export?gid=0&format=csv"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# ABOUT_TEXT = """
|
| 20 |
# <h1>
|
| 21 |
# <span style="font-variant: small-caps;">M-RewardBench</span>: Evaluating Reward Models in Multilingual Settings
|
|
@@ -51,11 +57,41 @@ class AutoEvalColumn:
|
|
| 51 |
})
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
def get_result_data():
|
| 55 |
-
return pd.read_csv(
|
|
|
|
| 56 |
|
|
|
|
|
|
|
| 57 |
|
| 58 |
-
|
|
|
|
| 59 |
if dataframe is None or dataframe.empty:
|
| 60 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 61 |
|
|
@@ -63,18 +99,18 @@ def init_leaderboard(dataframe):
|
|
| 63 |
value=dataframe,
|
| 64 |
datatype=[
|
| 65 |
col["type"]
|
| 66 |
-
for col in
|
| 67 |
if isinstance(col, dict)
|
| 68 |
],
|
| 69 |
select_columns=SelectColumns(
|
| 70 |
default_selection=[
|
| 71 |
col["name"]
|
| 72 |
-
for col in
|
| 73 |
if isinstance(col, dict) and col["displayed_by_default"]
|
| 74 |
],
|
| 75 |
cant_deselect=[
|
| 76 |
col["name"]
|
| 77 |
-
for col in
|
| 78 |
if isinstance(col, dict) and col.get("never_hidden", False)
|
| 79 |
],
|
| 80 |
label="Select Columns to Display:",
|
|
@@ -99,7 +135,7 @@ emojis = "π’ π¬ π―"
|
|
| 99 |
model_types = {"Generative RM": "π¬", "DPO": "π―", "Sequence Classifier": "π’"}
|
| 100 |
|
| 101 |
from functools import partial
|
| 102 |
-
def format_with_color(val, min_val=50, max_val=100):
|
| 103 |
"""
|
| 104 |
Formats a value with inline green color gradient CSS.
|
| 105 |
Returns an HTML string with bold, black text and muted green background.
|
|
@@ -111,6 +147,7 @@ def format_with_color(val, min_val=50, max_val=100):
|
|
| 111 |
|
| 112 |
# Normalize value between 50 and 100 to 0-1 range
|
| 113 |
normalized = (val - min_val) / (max_val - min_val)
|
|
|
|
| 114 |
# Clamp value between 0 and 1
|
| 115 |
normalized = max(0, min(1, normalized))
|
| 116 |
|
|
@@ -119,7 +156,12 @@ def format_with_color(val, min_val=50, max_val=100):
|
|
| 119 |
intensity = int(50 + (150 * (1 - normalized)))
|
| 120 |
|
| 121 |
# Return HTML with inline CSS - bold black text
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
except (ValueError, TypeError):
|
| 125 |
return str(val)
|
|
@@ -131,12 +173,10 @@ with demo:
|
|
| 131 |
gr.Markdown(INTRODUCTION_TEXT)
|
| 132 |
|
| 133 |
with gr.Tabs() as tabs:
|
| 134 |
-
with gr.TabItem("π
|
| 135 |
df = get_result_data()
|
| 136 |
df["Model_Type"] = df["Model_Type"].map(model_types)
|
| 137 |
-
|
| 138 |
df["Model"] = df.apply(format_model_link, axis=1)
|
| 139 |
-
|
| 140 |
df["zho"] = df[["zho_Hans", "zho_Hant"]].mean(axis=1)
|
| 141 |
|
| 142 |
columns = lang_ids.split("\t")
|
|
@@ -152,22 +192,63 @@ with demo:
|
|
| 152 |
|
| 153 |
# df = df.style.applymap(apply_color_gradient, subset=['eng'])
|
| 154 |
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
|
|
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
for col in numeric_cols:
|
| 158 |
lang_format_with_color = partial(format_with_color,
|
| 159 |
-
min_val=df[col].min(),
|
| 160 |
-
max_val=df[col].max()
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
df[col] = df[col].apply(lang_format_with_color)
|
| 163 |
|
| 164 |
-
|
| 165 |
# for col in numeric_cols:
|
| 166 |
# df[col] = (df[col] * 100).round(1).astype(str)
|
| 167 |
|
| 168 |
AutoEvalColumn.add_columns_from_df(df, numeric_cols)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
with gr.Row():
|
| 173 |
with gr.Accordion("π Citation", open=False):
|
|
|
|
| 15 |
|
| 16 |
π https://m-rewardbench.github.io/'''
|
| 17 |
|
| 18 |
+
# GOOGLE_SHEET_URL = "https://docs.google.com/spreadsheets/d/1qrD7plUdrBwAw7G6UeDVZAaV9ihxaNAcoiKwSaqotR4/export?gid=0&format=csv"
|
| 19 |
+
|
| 20 |
+
# Google Sheets "gviz" query endpoints that request CSV output (tqx=out:csv).
# Both point at the same spreadsheet and differ only in the `sheet=` value
# (presumably the tab name -- confirm against the spreadsheet).
_GT_SHEET_CSV_URL = "https://docs.google.com/spreadsheets/d/1qrD7plUdrBwAw7G6UeDVZAaV9ihxaNAcoiKwSaqotR4/gviz/tq?tqx=out:csv&sheet=gt"
_MAPLE_SHEET_CSV_URL = "https://docs.google.com/spreadsheets/d/1qrD7plUdrBwAw7G6UeDVZAaV9ihxaNAcoiKwSaqotR4/gviz/tq?tqx=out:csv&sheet=maple"

# Order matters: get_result_data() reads index 0 and get_translation_data()
# reads index 1.
GOOGLE_SHEET_URLS = [
    _GT_SHEET_CSV_URL,
    _MAPLE_SHEET_CSV_URL,
]
|
| 24 |
+
|
| 25 |
# ABOUT_TEXT = """
|
| 26 |
# <h1>
|
| 27 |
# <span style="font-variant: small-caps;">M-RewardBench</span>: Evaluating Reward Models in Multilingual Settings
|
|
|
|
| 57 |
})
|
| 58 |
|
| 59 |
|
| 60 |
+
class AutoEvalColumnTranslation:
    """Column registry for the translation leaderboard tab.

    Each class attribute whose value is a dict describes one column through
    the keys ``name``, ``type``, ``displayed_by_default`` and ``never_hidden``.
    ``init_leaderboard`` walks ``df_class.__dict__.values()`` and keeps the
    dict entries, so every dict stored on this class becomes a column spec.
    """

    # Pinned "Model" column: shown by default and flagged never_hidden.
    model = {
        "name": "Model",
        "type": "markdown",
        "displayed_by_default": True,
        "never_hidden": True,
    }

    # Pinned model-type column. The translation tab renames the dataframe's
    # "Model_Type" column to "MT" before building the leaderboard.
    model_type = {
        "name": "MT",
        "type": "markdown",
        "displayed_by_default": True,
        "never_hidden": True,
    }

    @classmethod
    def add_columns_from_df(cls, df, columns):
        """Register a markdown column spec on the class for each name in ``columns``.

        Names that collide with a pinned column are skipped so that the
        registry never holds two specs with the same display name: "model"
        is matched case-insensitively (as before), and the pinned model-type
        name ("MT") is matched exactly so that differently-cased columns are
        still registered.

        Args:
            df: Unused; kept so existing callers that pass the dataframe
                keep working.
            columns: Iterable of column-name strings to register.
        """
        pinned_type_name = cls.model_type["name"]
        for col in columns:
            # The pinned specs above already describe these columns; adding
            # another spec would duplicate them in the dict-valued attributes.
            if col.lower() == 'model' or col == pinned_type_name:
                continue
            setattr(cls, col, {
                "name": col,
                "type": "markdown",
                "displayed_by_default": True,
                "never_hidden": False,
            })
|
| 85 |
+
|
| 86 |
def get_result_data():
    """Read the first URL in ``GOOGLE_SHEET_URLS`` as CSV and return the DataFrame."""
    main_sheet_url = GOOGLE_SHEET_URLS[0]
    return pd.read_csv(main_sheet_url)
|
| 88 |
+
|
| 89 |
|
| 90 |
+
def get_translation_data():
    """Read the second URL in ``GOOGLE_SHEET_URLS`` as CSV and return the DataFrame."""
    translation_sheet_url = GOOGLE_SHEET_URLS[1]
    return pd.read_csv(translation_sheet_url)
|
| 92 |
|
| 93 |
+
|
| 94 |
+
def init_leaderboard(dataframe, df_class):
|
| 95 |
if dataframe is None or dataframe.empty:
|
| 96 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 97 |
|
|
|
|
| 99 |
value=dataframe,
|
| 100 |
datatype=[
|
| 101 |
col["type"]
|
| 102 |
+
for col in df_class.__dict__.values()
|
| 103 |
if isinstance(col, dict)
|
| 104 |
],
|
| 105 |
select_columns=SelectColumns(
|
| 106 |
default_selection=[
|
| 107 |
col["name"]
|
| 108 |
+
for col in df_class.__dict__.values()
|
| 109 |
if isinstance(col, dict) and col["displayed_by_default"]
|
| 110 |
],
|
| 111 |
cant_deselect=[
|
| 112 |
col["name"]
|
| 113 |
+
for col in df_class.__dict__.values()
|
| 114 |
if isinstance(col, dict) and col.get("never_hidden", False)
|
| 115 |
],
|
| 116 |
label="Select Columns to Display:",
|
|
|
|
| 135 |
model_types = {"Generative RM": "π¬", "DPO": "π―", "Sequence Classifier": "π’"}
|
| 136 |
|
| 137 |
from functools import partial
|
| 138 |
+
def format_with_color(val, min_val=50, max_val=100, scale=True):
|
| 139 |
"""
|
| 140 |
Formats a value with inline green color gradient CSS.
|
| 141 |
Returns an HTML string with bold, black text and muted green background.
|
|
|
|
| 147 |
|
| 148 |
# Normalize value between 50 and 100 to 0-1 range
|
| 149 |
normalized = (val - min_val) / (max_val - min_val)
|
| 150 |
+
# print(normalized)
|
| 151 |
# Clamp value between 0 and 1
|
| 152 |
normalized = max(0, min(1, normalized))
|
| 153 |
|
|
|
|
| 156 |
intensity = int(50 + (150 * (1 - normalized)))
|
| 157 |
|
| 158 |
# Return HTML with inline CSS - bold black text
|
| 159 |
+
show_val = val
|
| 160 |
+
|
| 161 |
+
if scale:
|
| 162 |
+
show_val = val*100
|
| 163 |
+
|
| 164 |
+
return f'<div val={val} style="background-color: rgb({intensity}, 200, {intensity}); color: black; font-weight: bold; text-align: center; vertical-align: middle;">{show_val:.1f}</div>'
|
| 165 |
|
| 166 |
except (ValueError, TypeError):
|
| 167 |
return str(val)
|
|
|
|
| 173 |
gr.Markdown(INTRODUCTION_TEXT)
|
| 174 |
|
| 175 |
with gr.Tabs() as tabs:
|
| 176 |
+
with gr.TabItem("π
Main"):
|
| 177 |
df = get_result_data()
|
| 178 |
df["Model_Type"] = df["Model_Type"].map(model_types)
|
|
|
|
| 179 |
df["Model"] = df.apply(format_model_link, axis=1)
|
|
|
|
| 180 |
df["zho"] = df[["zho_Hans", "zho_Hant"]].mean(axis=1)
|
| 181 |
|
| 182 |
columns = lang_ids.split("\t")
|
|
|
|
| 192 |
|
| 193 |
# df = df.style.applymap(apply_color_gradient, subset=['eng'])
|
| 194 |
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
| 195 |
+
global_min = df.select_dtypes(include='number').min().min().astype(float)
|
| 196 |
+
global_max = df.select_dtypes(include='number').max().max().astype(float)
|
| 197 |
|
| 198 |
|
| 199 |
for col in numeric_cols:
|
| 200 |
lang_format_with_color = partial(format_with_color,
|
| 201 |
+
# min_val=df[col].min(),
|
| 202 |
+
# max_val=df[col].max(),
|
| 203 |
+
min_val=global_min,
|
| 204 |
+
max_val=global_max,
|
| 205 |
+
)
|
| 206 |
|
| 207 |
df[col] = df[col].apply(lang_format_with_color)
|
| 208 |
|
|
|
|
| 209 |
# for col in numeric_cols:
|
| 210 |
# df[col] = (df[col] * 100).round(1).astype(str)
|
| 211 |
|
| 212 |
AutoEvalColumn.add_columns_from_df(df, numeric_cols)
|
| 213 |
+
leaderboard = init_leaderboard(df, AutoEvalColumn)
|
| 214 |
+
|
| 215 |
+
with gr.TabItem("π
Translation"):
|
| 216 |
+
df = get_translation_data()
|
| 217 |
+
df["Model_Type"] = df["Model_Type"].map(model_types)
|
| 218 |
+
df["Model"] = df.apply(format_model_link, axis=1)
|
| 219 |
+
|
| 220 |
+
df.rename(columns={
|
| 221 |
+
"Model_Type": "MT",
|
| 222 |
+
"Avg": "AVG",
|
| 223 |
+
}, inplace=True)
|
| 224 |
+
|
| 225 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
| 226 |
+
# print(df[numeric_cols].min().min())
|
| 227 |
+
# print(df[numeric_cols].max().max())
|
| 228 |
+
global_min = df.select_dtypes(include='number').min().min().astype(float)
|
| 229 |
+
global_max = df.select_dtypes(include='number').max().max().astype(float)
|
| 230 |
+
# print(global_max)
|
| 231 |
+
|
| 232 |
+
for col in numeric_cols:
|
| 233 |
+
# print(df[col].min())
|
| 234 |
+
lang_format_with_color = partial(format_with_color,
|
| 235 |
+
min_val=global_min,
|
| 236 |
+
max_val=global_max,
|
| 237 |
+
# min_val=df[col].min(),
|
| 238 |
+
# max_val=df[col].max(),
|
| 239 |
+
scale=False)
|
| 240 |
+
df[col] = df[col].apply(lang_format_with_color)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
|
| 244 |
+
|
| 245 |
+
# for col in numeric_cols:
|
| 246 |
+
# df[col] = (df[col] * 100).round(1).astype(str)
|
| 247 |
+
|
| 248 |
+
AutoEvalColumnTranslation.add_columns_from_df(df, numeric_cols)
|
| 249 |
+
leaderboard = init_leaderboard(df, AutoEvalColumnTranslation)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
|
| 253 |
with gr.Row():
|
| 254 |
with gr.Accordion("π Citation", open=False):
|