Spaces:
Running
Running
Smita R
Smita
committed on
re-ordering evals (#41)
Browse files
Co-authored-by: Smita <>
- app.py +1 -1
- leaderboard_transformer.py +6 -6
app.py
CHANGED
|
@@ -145,7 +145,7 @@ with demo.route("Code & Execution", "/code-execution"):
|
|
| 145 |
with demo.route("Data Analysis", "/data-analysis"):
|
| 146 |
build_data_analysis_page()
|
| 147 |
|
| 148 |
-
with demo.route("Discovery", "/discovery"):
|
| 149 |
build_e2e_page()
|
| 150 |
|
| 151 |
with demo.route("About", "/about"):
|
|
|
|
| 145 |
with demo.route("Data Analysis", "/data-analysis"):
|
| 146 |
build_data_analysis_page()
|
| 147 |
|
| 148 |
+
with demo.route("End-to-End Discovery", "/discovery"):
|
| 149 |
build_e2e_page()
|
| 150 |
|
| 151 |
with demo.route("About", "/about"):
|
leaderboard_transformer.py
CHANGED
|
@@ -16,7 +16,7 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
|
|
| 16 |
"discovery": "Discovery",
|
| 17 |
|
| 18 |
# Validation Names
|
| 19 |
-
"arxivdigestables_validation": "
|
| 20 |
"sqa_dev": "ScholarQA-CS2",
|
| 21 |
"litqa2_validation": "LitQA2-FullText",
|
| 22 |
"paper_finder_validation": "PaperFindingBench",
|
|
@@ -26,19 +26,19 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
|
|
| 26 |
"ds1000_validation": "DS-1000",
|
| 27 |
"e2e_discovery_validation": "E2E-Bench",
|
| 28 |
"e2e_discovery_hard_validation": "E2E-Bench-Hard",
|
| 29 |
-
"super_validation": "SUPER",
|
| 30 |
# Test Names
|
| 31 |
"paper_finder_test": "PaperFindingBench",
|
| 32 |
"paper_finder_litqa2_test": "LitQA2-FullText-Search",
|
| 33 |
"sqa_test": "ScholarQA-CS2",
|
| 34 |
-
"arxivdigestables_test": "
|
| 35 |
"litqa2_test": "LitQA2-FullText",
|
| 36 |
"discoverybench_test": "DiscoveryBench",
|
| 37 |
"core_bench_test": "CORE-Bench-Hard",
|
| 38 |
"ds1000_test": "DS-1000",
|
| 39 |
"e2e_discovery_test": "E2E-Bench",
|
| 40 |
"e2e_discovery_hard_test": "E2E-Bench-Hard",
|
| 41 |
-
"super_test": "SUPER",
|
| 42 |
}
|
| 43 |
ORDER_MAP = {
|
| 44 |
'Literature Understanding': [
|
|
@@ -46,11 +46,11 @@ ORDER_MAP = {
|
|
| 46 |
'LitQA2-FullText-Search',
|
| 47 |
'ScholarQA-CS2',
|
| 48 |
'LitQA2-FullText',
|
| 49 |
-
'
|
| 50 |
],
|
| 51 |
'Code Execution': [
|
|
|
|
| 52 |
'CORE-Bench-Hard',
|
| 53 |
-
'SUPER',
|
| 54 |
'DS-1000'
|
| 55 |
],
|
| 56 |
# Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories
|
|
|
|
| 16 |
"discovery": "Discovery",
|
| 17 |
|
| 18 |
# Validation Names
|
| 19 |
+
"arxivdigestables_validation": "ArxivDIGESTables-Clean",
|
| 20 |
"sqa_dev": "ScholarQA-CS2",
|
| 21 |
"litqa2_validation": "LitQA2-FullText",
|
| 22 |
"paper_finder_validation": "PaperFindingBench",
|
|
|
|
| 26 |
"ds1000_validation": "DS-1000",
|
| 27 |
"e2e_discovery_validation": "E2E-Bench",
|
| 28 |
"e2e_discovery_hard_validation": "E2E-Bench-Hard",
|
| 29 |
+
"super_validation": "SUPER-Expert",
|
| 30 |
# Test Names
|
| 31 |
"paper_finder_test": "PaperFindingBench",
|
| 32 |
"paper_finder_litqa2_test": "LitQA2-FullText-Search",
|
| 33 |
"sqa_test": "ScholarQA-CS2",
|
| 34 |
+
"arxivdigestables_test": "ArxivDIGESTables-Clean",
|
| 35 |
"litqa2_test": "LitQA2-FullText",
|
| 36 |
"discoverybench_test": "DiscoveryBench",
|
| 37 |
"core_bench_test": "CORE-Bench-Hard",
|
| 38 |
"ds1000_test": "DS-1000",
|
| 39 |
"e2e_discovery_test": "E2E-Bench",
|
| 40 |
"e2e_discovery_hard_test": "E2E-Bench-Hard",
|
| 41 |
+
"super_test": "SUPER-Expert",
|
| 42 |
}
|
| 43 |
ORDER_MAP = {
|
| 44 |
'Literature Understanding': [
|
|
|
|
| 46 |
'LitQA2-FullText-Search',
|
| 47 |
'ScholarQA-CS2',
|
| 48 |
'LitQA2-FullText',
|
| 49 |
+
'ArxivDIGESTables-Clean'
|
| 50 |
],
|
| 51 |
'Code Execution': [
|
| 52 |
+
'SUPER-Expert',
|
| 53 |
'CORE-Bench-Hard',
|
|
|
|
| 54 |
'DS-1000'
|
| 55 |
],
|
| 56 |
# Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories
|