Spaces:
Running
Running
Improve PoT implementation and score
Browse files
- gen_table.py +9 -1
- src/detail_math_score.json +20 -20
- src/overall_math_score.json +6 -6
gen_table.py
CHANGED
|
@@ -7,6 +7,7 @@ import gradio as gr
|
|
| 7 |
import numpy as np
|
| 8 |
import pandas as pd
|
| 9 |
|
|
|
|
| 10 |
from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
|
| 11 |
|
| 12 |
|
|
@@ -150,7 +151,14 @@ def generate_table(results, fields):
|
|
| 150 |
res[f"{d}-Cost($)"].append(None)
|
| 151 |
|
| 152 |
# Calculate average score
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
df = pd.DataFrame(res)
|
| 156 |
|
|
|
|
| 7 |
import numpy as np
|
| 8 |
import pandas as pd
|
| 9 |
|
| 10 |
+
from decimal import Decimal, ROUND_HALF_UP
|
| 11 |
from meta_data import OVERALL_MATH_SCORE_FILE, DEFAULT_MATH_BENCH, META_FIELDS
|
| 12 |
|
| 13 |
|
|
|
|
| 151 |
res[f"{d}-Cost($)"].append(None)
|
| 152 |
|
| 153 |
# Calculate average score
|
| 154 |
+
if scores:
|
| 155 |
+
decimal_numbers = [Decimal(str(num)) for num in scores]
|
| 156 |
+
avg_score = Decimal(str(np.mean(scores) if scores else None))
|
| 157 |
+
avg_score = sum(decimal_numbers) / len(decimal_numbers)
|
| 158 |
+
else:
|
| 159 |
+
avg_score = None
|
| 160 |
+
formatted_average = avg_score.quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)
|
| 161 |
+
res['Avg Score'].append(formatted_average)
|
| 162 |
|
| 163 |
df = pd.DataFrame(res)
|
| 164 |
|
src/detail_math_score.json
CHANGED
|
@@ -226,17 +226,17 @@
|
|
| 226 |
"Cost($)": 0.6902
|
| 227 |
},
|
| 228 |
"AQuA": {
|
| 229 |
-
"Score":
|
| 230 |
-
"Pass rate":
|
| 231 |
"X-shot": 0,
|
| 232 |
"Parameters": "",
|
| 233 |
"Samples": 254,
|
| 234 |
-
"Total input tokens":
|
| 235 |
-
"Average input tokens":
|
| 236 |
-
"Total output tokens":
|
| 237 |
-
"Average output tokens":
|
| 238 |
-
"All tokens":
|
| 239 |
-
"Cost($)": 0.
|
| 240 |
}
|
| 241 |
},
|
| 242 |
"Doubao-lite-32k": {
|
|
@@ -246,30 +246,30 @@
|
|
| 246 |
"Eval Date": "2025/01/07"
|
| 247 |
},
|
| 248 |
"gsm8k": {
|
| 249 |
-
"Score": 79.
|
| 250 |
-
"Pass rate": 92.
|
| 251 |
"X-shot": 8,
|
| 252 |
"Parameters": "",
|
| 253 |
"Samples": 1319,
|
| 254 |
"Total input tokens": 1170038,
|
| 255 |
"Average input tokens": 887,
|
| 256 |
-
"Total output tokens":
|
| 257 |
"Average output tokens": 89,
|
| 258 |
-
"All tokens":
|
| 259 |
"Cost($)": 0.0575
|
| 260 |
},
|
| 261 |
"AQuA": {
|
| 262 |
-
"Score":
|
| 263 |
-
"Pass rate":
|
| 264 |
"X-shot": 0,
|
| 265 |
"Parameters": "",
|
| 266 |
"Samples": 254,
|
| 267 |
-
"Total input tokens":
|
| 268 |
-
"Average input tokens":
|
| 269 |
-
"Total output tokens":
|
| 270 |
-
"Average output tokens":
|
| 271 |
-
"All tokens":
|
| 272 |
-
"Cost($)": 0.
|
| 273 |
}
|
| 274 |
}
|
| 275 |
},
|
|
|
|
| 226 |
"Cost($)": 0.6902
|
| 227 |
},
|
| 228 |
"AQuA": {
|
| 229 |
+
"Score": 59.45,
|
| 230 |
+
"Pass rate": 100,
|
| 231 |
"X-shot": 0,
|
| 232 |
"Parameters": "",
|
| 233 |
"Samples": 254,
|
| 234 |
+
"Total input tokens": 225162,
|
| 235 |
+
"Average input tokens": 886,
|
| 236 |
+
"Total output tokens": 41492,
|
| 237 |
+
"Average output tokens": 163,
|
| 238 |
+
"All tokens": 266654,
|
| 239 |
+
"Cost($)": 0.1748
|
| 240 |
}
|
| 241 |
},
|
| 242 |
"Doubao-lite-32k": {
|
|
|
|
| 246 |
"Eval Date": "2025/01/07"
|
| 247 |
},
|
| 248 |
"gsm8k": {
|
| 249 |
+
"Score": 79.61,
|
| 250 |
+
"Pass rate": 92.57,
|
| 251 |
"X-shot": 8,
|
| 252 |
"Parameters": "",
|
| 253 |
"Samples": 1319,
|
| 254 |
"Total input tokens": 1170038,
|
| 255 |
"Average input tokens": 887,
|
| 256 |
+
"Total output tokens": 118017,
|
| 257 |
"Average output tokens": 89,
|
| 258 |
+
"All tokens": 1288055,
|
| 259 |
"Cost($)": 0.0575
|
| 260 |
},
|
| 261 |
"AQuA": {
|
| 262 |
+
"Score": 71.65,
|
| 263 |
+
"Pass rate": 96.85,
|
| 264 |
"X-shot": 0,
|
| 265 |
"Parameters": "",
|
| 266 |
"Samples": 254,
|
| 267 |
+
"Total input tokens": 259863,
|
| 268 |
+
"Average input tokens": 1023,
|
| 269 |
+
"Total output tokens": 49573,
|
| 270 |
+
"Average output tokens": 195,
|
| 271 |
+
"All tokens": 309436,
|
| 272 |
+
"Cost($)": 0.0147
|
| 273 |
}
|
| 274 |
}
|
| 275 |
},
|
src/overall_math_score.json
CHANGED
|
@@ -57,8 +57,8 @@
|
|
| 57 |
"Cost($)": 0.6902
|
| 58 |
},
|
| 59 |
"AQuA": {
|
| 60 |
-
"Score":
|
| 61 |
-
"Cost($)": 0.
|
| 62 |
}
|
| 63 |
},
|
| 64 |
"ReAct-Pro*": {
|
|
@@ -128,12 +128,12 @@
|
|
| 128 |
"Eval Date": "2025/01/07"
|
| 129 |
},
|
| 130 |
"gsm8k": {
|
| 131 |
-
"Score": 79.
|
| 132 |
-
"Cost($)": 0.
|
| 133 |
},
|
| 134 |
"AQuA": {
|
| 135 |
-
"Score":
|
| 136 |
-
"Cost($)": 0.
|
| 137 |
}
|
| 138 |
},
|
| 139 |
"ReAct-Pro-Doubao": {
|
|
|
|
| 57 |
"Cost($)": 0.6902
|
| 58 |
},
|
| 59 |
"AQuA": {
|
| 60 |
+
"Score": 59.45,
|
| 61 |
+
"Cost($)": 0.1748
|
| 62 |
}
|
| 63 |
},
|
| 64 |
"ReAct-Pro*": {
|
|
|
|
| 128 |
"Eval Date": "2025/01/07"
|
| 129 |
},
|
| 130 |
"gsm8k": {
|
| 131 |
+
"Score": 79.61,
|
| 132 |
+
"Cost($)": 0.0576
|
| 133 |
},
|
| 134 |
"AQuA": {
|
| 135 |
+
"Score": 71.65,
|
| 136 |
+
"Cost($)": 0.0147
|
| 137 |
}
|
| 138 |
},
|
| 139 |
"ReAct-Pro-Doubao": {
|