Smita R Smita committed on
Commit
ad46ea8
·
unverified ·
1 Parent(s): f48fa14

re-ordering evals (#41)

Browse files

Co-authored-by: Smita <>

Files changed (2) hide show
  1. app.py +1 -1
  2. leaderboard_transformer.py +6 -6
app.py CHANGED
@@ -145,7 +145,7 @@ with demo.route("Code & Execution", "/code-execution"):
145
  with demo.route("Data Analysis", "/data-analysis"):
146
  build_data_analysis_page()
147
 
148
- with demo.route("Discovery", "/discovery"):
149
  build_e2e_page()
150
 
151
  with demo.route("About", "/about"):
 
145
  with demo.route("Data Analysis", "/data-analysis"):
146
  build_data_analysis_page()
147
 
148
+ with demo.route("End-to-End Discovery", "/discovery"):
149
  build_e2e_page()
150
 
151
  with demo.route("About", "/about"):
leaderboard_transformer.py CHANGED
@@ -16,7 +16,7 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
16
  "discovery": "Discovery",
17
 
18
  # Validation Names
19
- "arxivdigestables_validation": "ArxivDIGES Tables",
20
  "sqa_dev": "ScholarQA-CS2",
21
  "litqa2_validation": "LitQA2-FullText",
22
  "paper_finder_validation": "PaperFindingBench",
@@ -26,19 +26,19 @@ INFORMAL_TO_FORMAL_NAME_MAP = {
26
  "ds1000_validation": "DS-1000",
27
  "e2e_discovery_validation": "E2E-Bench",
28
  "e2e_discovery_hard_validation": "E2E-Bench-Hard",
29
- "super_validation": "SUPER",
30
  # Test Names
31
  "paper_finder_test": "PaperFindingBench",
32
  "paper_finder_litqa2_test": "LitQA2-FullText-Search",
33
  "sqa_test": "ScholarQA-CS2",
34
- "arxivdigestables_test": "ArxivDIGES Tables",
35
  "litqa2_test": "LitQA2-FullText",
36
  "discoverybench_test": "DiscoveryBench",
37
  "core_bench_test": "CORE-Bench-Hard",
38
  "ds1000_test": "DS-1000",
39
  "e2e_discovery_test": "E2E-Bench",
40
  "e2e_discovery_hard_test": "E2E-Bench-Hard",
41
- "super_test": "SUPER",
42
  }
43
  ORDER_MAP = {
44
  'Literature Understanding': [
@@ -46,11 +46,11 @@ ORDER_MAP = {
46
  'LitQA2-FullText-Search',
47
  'ScholarQA-CS2',
48
  'LitQA2-FullText',
49
- 'ArxivDIGES Tables'
50
  ],
51
  'Code Execution': [
 
52
  'CORE-Bench-Hard',
53
- 'SUPER',
54
  'DS-1000'
55
  ],
56
  # Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories
 
16
  "discovery": "Discovery",
17
 
18
  # Validation Names
19
+ "arxivdigestables_validation": "ArxivDIGESTables-Clean",
20
  "sqa_dev": "ScholarQA-CS2",
21
  "litqa2_validation": "LitQA2-FullText",
22
  "paper_finder_validation": "PaperFindingBench",
 
26
  "ds1000_validation": "DS-1000",
27
  "e2e_discovery_validation": "E2E-Bench",
28
  "e2e_discovery_hard_validation": "E2E-Bench-Hard",
29
+ "super_validation": "SUPER-Expert",
30
  # Test Names
31
  "paper_finder_test": "PaperFindingBench",
32
  "paper_finder_litqa2_test": "LitQA2-FullText-Search",
33
  "sqa_test": "ScholarQA-CS2",
34
+ "arxivdigestables_test": "ArxivDIGESTables-Clean",
35
  "litqa2_test": "LitQA2-FullText",
36
  "discoverybench_test": "DiscoveryBench",
37
  "core_bench_test": "CORE-Bench-Hard",
38
  "ds1000_test": "DS-1000",
39
  "e2e_discovery_test": "E2E-Bench",
40
  "e2e_discovery_hard_test": "E2E-Bench-Hard",
41
+ "super_test": "SUPER-Expert",
42
  }
43
  ORDER_MAP = {
44
  'Literature Understanding': [
 
46
  'LitQA2-FullText-Search',
47
  'ScholarQA-CS2',
48
  'LitQA2-FullText',
49
+ 'ArxivDIGESTables-Clean'
50
  ],
51
  'Code Execution': [
52
+ 'SUPER-Expert',
53
  'CORE-Bench-Hard',
 
54
  'DS-1000'
55
  ],
56
  # Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories