panuthept committed on
Commit
4fb8a8d
·
1 Parent(s): cf7ae80

add more results

Browse files
Files changed (1) hide show
  1. app.py +205 -17
app.py CHANGED
@@ -5,28 +5,81 @@ from css_html_js import custom_css
5
  TITLE = """<h1 align="center" id="space-title">🇹🇭 Thai Sentence Embedding Leaderboard</h1>"""
6
 
7
  INTRODUCTION_TEXT = """
8
- 📐 The 🇹🇭 Thai Sentence Embedding Leaderboard aims to track, rank and evaluate open embedding models on Thai sentence embedding tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
9
- ## Dataset
10
- 📈 We evaluate models based on 3 datasets,
11
- 1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
12
- - This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
13
- 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
14
- - This test is general test for malay grammar.
15
- 3. General high school science questions, contains 323 questions, https://huggingface.co/datasets/mesolitica/mysoalan.com-qa
16
- - This test is general test for science.
17
- 4. Translated MMLU, https://huggingface.co/datasets/mesolitica/translated-MMLU
18
- - This test is to test general knowledge, originally from MMLU.
19
- ## Contributions
20
- 1. Claude 1.3 and 2.0 Tatabahasa contributed by https://www.linkedin.com/in/fahim-surani
21
- 2. Claude 3.0 contributed by https://github.com/theblackcat102, https://huggingface.co/theblackcat102
22
  ## Tagging
23
- 🟒 Non-LLM β­• LLM πŸ“¦ API
24
  """
25
 
26
  results = [
27
  {
28
  'T': '🟢',
29
- 'model': '[BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  'Model Size (Million Parameters)': 570,
31
  'Embedding Dimensions': 1024,
32
  'Average (8 datasets)': 75.64,
@@ -35,9 +88,141 @@ results = [
35
  'PairClassification (1 datasets)': 79.02,
36
  'Retrieval (3 datasets)': 91.42,
37
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  {
39
  'T': '📦',
40
- 'model': 'Cohere-embed-multilingual-v3.0',
41
  'Embedding Dimensions': 1024,
42
  'Average (8 datasets)': 74.86,
43
  'STS Average (1 datasets)': 77.87,
@@ -47,6 +232,9 @@ results = [
47
  },
48
  ]
49
 
 
 
 
50
  data = pd.DataFrame(results)
51
 
52
  demo = gr.Blocks(css=custom_css)
 
5
  TITLE = """<h1 align="center" id="space-title">🇹🇭 Thai Sentence Embedding Leaderboard</h1>"""
6
 
7
  INTRODUCTION_TEXT = """
8
+ 📐 The 🇹🇭 Thai Sentence Embedding Leaderboard aims to track, rank and evaluate open embedding models on Thai sentence embedding tasks. Source code for evaluation at https://github.com/mrpeerat/Thai-Sentence-Vector-Benchmark, feel free to submit your own score at https://huggingface.co/spaces/panuthept/thai_sentence_embedding_benchmark/discussions.
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ## Tagging
10
+ 🟒 Open sourced πŸ“¦ API
11
  """
12
 
13
  results = [
14
  {
15
  'T': '🟢',
16
+ 'Model Name': '[XLMR-base](https://huggingface.co/FacebookAI/xlm-roberta-base)',
17
+ 'Model Size (Million Parameters)': 279,
18
+ 'Embedding Dimensions': 768,
19
+ 'Average (8 datasets)': 37.95,
20
+ 'STS Average (1 datasets)': 44.48,
21
+ 'Classification (3 datasets)': 58.42,
22
+ 'PairClassification (1 datasets)': 57.62,
23
+ 'Retrieval (3 datasets)': 5.57,
24
+ },
25
+ {
26
+ 'T': '🟢',
27
+ 'Model Name': '[XLMR-large](https://huggingface.co/FacebookAI/xlm-roberta-large)',
28
+ 'Model Size (Million Parameters)': 561,
29
+ 'Embedding Dimensions': 1024,
30
+ 'Average (8 datasets)': 38.59,
31
+ 'STS Average (1 datasets)': 38.31,
32
+ 'Classification (3 datasets)': 59.51,
33
+ 'PairClassification (1 datasets)': 54.56,
34
+ 'Retrieval (3 datasets)': 11.80,
35
+ },
36
+ {
37
+ 'T': '🟢',
38
+ 'Model Name': '[WangchanBERTa](https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased)',
39
+ 'Model Size (Million Parameters)': 106,
40
+ 'Embedding Dimensions': 768,
41
+ 'Average (8 datasets)': 36.34,
42
+ 'STS Average (1 datasets)': 21.32,
43
+ 'Classification (3 datasets)': 55.46,
44
+ 'PairClassification (1 datasets)': 52.96,
45
+ 'Retrieval (3 datasets)': 19.49,
46
+ },
47
+ {
48
+ 'T': '🟢',
49
+ 'Model Name': '[PhayaThaiBERT](https://huggingface.co/clicknext/phayathaibert)',
50
+ 'Model Size (Million Parameters)': 278,
51
+ 'Embedding Dimensions': 768,
52
+ 'Average (8 datasets)': 55.38,
53
+ 'STS Average (1 datasets)': 51.56,
54
+ 'Classification (3 datasets)': 59.90,
55
+ 'PairClassification (1 datasets)': 59.67,
56
+ 'Retrieval (3 datasets)': 56.31,
57
+ },
58
+ {
59
+ 'T': '🟢',
60
+ 'Model Name': '[MPNet-multilingual](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2)',
61
+ 'Model Size (Million Parameters)': 278,
62
+ 'Embedding Dimensions': 768,
63
+ 'Average (8 datasets)': 66.14,
64
+ 'STS Average (1 datasets)': 80.49,
65
+ 'Classification (3 datasets)': 56.89,
66
+ 'PairClassification (1 datasets)': 84.14,
67
+ 'Retrieval (3 datasets)': 64.13,
68
+ },
69
+ {
70
+ 'T': '🟢',
71
+ 'Model Name': '[DistilUSE-multilingual](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2)',
72
+ 'Model Size (Million Parameters)': 135,
73
+ 'Embedding Dimensions': 512,
74
+ 'Average (8 datasets)': 51.45,
75
+ 'STS Average (1 datasets)': 65.37,
76
+ 'Classification (3 datasets)': 50.93,
77
+ 'PairClassification (1 datasets)': 65.94,
78
+ 'Retrieval (3 datasets)': 42.72,
79
+ },
80
+ {
81
+ 'T': '🟢',
82
+ 'Model Name': '[BGE-M3](https://huggingface.co/BAAI/bge-m3)',
83
  'Model Size (Million Parameters)': 570,
84
  'Embedding Dimensions': 1024,
85
  'Average (8 datasets)': 75.64,
 
88
  'PairClassification (1 datasets)': 79.02,
89
  'Retrieval (3 datasets)': 91.42,
90
  },
91
+ {
92
+ 'T': '🟢',
93
+ 'Model Name': '[SimCSE-XLMR-base](https://huggingface.co/kornwtp/simcse-model-XLMR)',
94
+ 'Model Size (Million Parameters)': 279,
95
+ 'Embedding Dimensions': 768,
96
+ 'Average (8 datasets)': 53.83,
97
+ 'STS Average (1 datasets)': 63.98,
98
+ 'Classification (3 datasets)': 49.44,
99
+ 'PairClassification (1 datasets)': 61.87,
100
+ 'Retrieval (3 datasets)': 54.17,
101
+ },
102
+ {
103
+ 'T': '🟢',
104
+ 'Model Name': '[SimCSE-WangchanBERTa](https://huggingface.co/kornwtp/simcse-model-wangchanberta)',
105
+ 'Model Size (Million Parameters)': 106,
106
+ 'Embedding Dimensions': 768,
107
+ 'Average (8 datasets)': 54.01,
108
+ 'STS Average (1 datasets)': 60.73,
109
+ 'Classification (3 datasets)': 56.71,
110
+ 'PairClassification (1 datasets)': 59.14,
111
+ 'Retrieval (3 datasets)': 51.05,
112
+ },
113
+ {
114
+ 'T': '🟢',
115
+ 'Model Name': '[SimCSE-PhayaThaiBERT](https://huggingface.co/kornwtp/simcse-model-phayathaibert)',
116
+ 'Model Size (Million Parameters)': 278,
117
+ 'Embedding Dimensions': 768,
118
+ 'Average (8 datasets)': 60.02,
119
+ 'STS Average (1 datasets)': None,
120
+ 'Classification (3 datasets)': None,
121
+ 'PairClassification (1 datasets)': None,
122
+ 'Retrieval (3 datasets)': None,
123
+ },
124
+ {
125
+ 'T': '🟢',
126
+ 'Model Name': '[SCT-XLMR-base](https://huggingface.co/kornwtp/SCT-model-XLMR)',
127
+ 'Model Size (Million Parameters)': 279,
128
+ 'Embedding Dimensions': 768,
129
+ 'Average (8 datasets)': 57.69,
130
+ 'STS Average (1 datasets)': None,
131
+ 'Classification (3 datasets)': None,
132
+ 'PairClassification (1 datasets)': None,
133
+ 'Retrieval (3 datasets)': None,
134
+ },
135
+ {
136
+ 'T': '🟢',
137
+ 'Model Name': '[SCT-WangchanBERTa](https://huggingface.co/kornwtp/SCT-model-wangchanberta)',
138
+ 'Model Size (Million Parameters)': 106,
139
+ 'Embedding Dimensions': 768,
140
+ 'Average (8 datasets)': 62.22,
141
+ 'STS Average (1 datasets)': None,
142
+ 'Classification (3 datasets)': None,
143
+ 'PairClassification (1 datasets)': None,
144
+ 'Retrieval (3 datasets)': None,
145
+ },
146
+ {
147
+ 'T': '🟢',
148
+ 'Model Name': '[SCT-PhayaThaiBERT](https://huggingface.co/kornwtp/SCT-model-phayathaibert)',
149
+ 'Model Size (Million Parameters)': 278,
150
+ 'Embedding Dimensions': 768,
151
+ 'Average (8 datasets)': 63.28,
152
+ 'STS Average (1 datasets)': None,
153
+ 'Classification (3 datasets)': None,
154
+ 'PairClassification (1 datasets)': None,
155
+ 'Retrieval (3 datasets)': None,
156
+ },
157
+ {
158
+ 'T': '🟢',
159
+ 'Model Name': '[SCT-KD-XLMR-base](https://huggingface.co/kornwtp/SCT-KD-model-XLMR)',
160
+ 'Model Size (Million Parameters)': 279,
161
+ 'Embedding Dimensions': 768,
162
+ 'Average (8 datasets)': 65.37,
163
+ 'STS Average (1 datasets)': None,
164
+ 'Classification (3 datasets)': None,
165
+ 'PairClassification (1 datasets)': None,
166
+ 'Retrieval (3 datasets)': None,
167
+ },
168
+ {
169
+ 'T': '🟢',
170
+ 'Model Name': '[SCT-KD-WangchanBERTa](https://huggingface.co/kornwtp/SCT-KD-model-wangchanberta)',
171
+ 'Model Size (Million Parameters)': 106,
172
+ 'Embedding Dimensions': 768,
173
+ 'Average (8 datasets)': 63.55,
174
+ 'STS Average (1 datasets)': None,
175
+ 'Classification (3 datasets)': None,
176
+ 'PairClassification (1 datasets)': None,
177
+ 'Retrieval (3 datasets)': None,
178
+ },
179
+ {
180
+ 'T': '🟢',
181
+ 'Model Name': '[SCT-KD-PhayaThaiBERT](https://huggingface.co/kornwtp/SCT-KD-model-phayathaibert)',
182
+ 'Model Size (Million Parameters)': 278,
183
+ 'Embedding Dimensions': 768,
184
+ 'Average (8 datasets)': 66.00,
185
+ 'STS Average (1 datasets)': None,
186
+ 'Classification (3 datasets)': None,
187
+ 'PairClassification (1 datasets)': None,
188
+ 'Retrieval (3 datasets)': None,
189
+ },
190
+ {
191
+ 'T': '🟢',
192
+ 'Model Name': '[ConGen-XLMR-base](https://huggingface.co/kornwtp/ConGen-model-XLMR)',
193
+ 'Model Size (Million Parameters)': 279,
194
+ 'Embedding Dimensions': 768,
195
+ 'Average (8 datasets)': 66.84,
196
+ 'STS Average (1 datasets)': None,
197
+ 'Classification (3 datasets)': None,
198
+ 'PairClassification (1 datasets)': None,
199
+ 'Retrieval (3 datasets)': None,
200
+ },
201
+ {
202
+ 'T': '🟢',
203
+ 'Model Name': '[ConGen-WangchanBERTa](https://huggingface.co/kornwtp/ConGen-model-wangchanberta)',
204
+ 'Model Size (Million Parameters)': 106,
205
+ 'Embedding Dimensions': 768,
206
+ 'Average (8 datasets)': 67.17,
207
+ 'STS Average (1 datasets)': None,
208
+ 'Classification (3 datasets)': None,
209
+ 'PairClassification (1 datasets)': None,
210
+ 'Retrieval (3 datasets)': None,
211
+ },
212
+ {
213
+ 'T': '🟢',
214
+ 'Model Name': '[ConGen-PhayaThaiBERT](https://huggingface.co/kornwtp/ConGen-model-phayathaibert)',
215
+ 'Model Size (Million Parameters)': 278,
216
+ 'Embedding Dimensions': 768,
217
+ 'Average (8 datasets)': 66.94,
218
+ 'STS Average (1 datasets)': None,
219
+ 'Classification (3 datasets)': None,
220
+ 'PairClassification (1 datasets)': None,
221
+ 'Retrieval (3 datasets)': None,
222
+ },
223
  {
224
  'T': '📦',
225
+ 'Model Name': 'Cohere-embed-multilingual-v3.0',
226
  'Embedding Dimensions': 1024,
227
  'Average (8 datasets)': 74.86,
228
  'STS Average (1 datasets)': 77.87,
 
232
  },
233
  ]
234
 
235
+ # Sort by average
236
+ results = sorted(results, key=lambda x: x['Average (8 datasets)'], reverse=True)
237
+
238
  data = pd.DataFrame(results)
239
 
240
  demo = gr.Blocks(css=custom_css)