1 | MiniMax-Text-01 | 456B | OpenSource | 83.55 | 89.37 | 80.92 | 58.08 | 88.46 | 68.55 | 55.12 | 76.80 | 10.00 | 83.86 | 88.92 | 75.09 | 89.02 | 36.25 | 22.97 |
2 | Claude 3.5 Sonnet 20241022 | N/A | API | 85.58 | - | 89.78 | 57.58 | - | - | - | 76.80 | 13.33 | - | - | 77.92 | 92.68 | 65.00 | - |
3 | Claude 3.7 Sonnet 20250219 | N/A | API | 87.25 | - | 77.62 | 67.68 | - | - | - | 79.80 | 30.00 | - | - | 80.74 | 93.90 | 63.25 | - |
4 | DeepSeek-V2.5 | 236B | OpenSource | 77.63 | 81.38 | 80.72 | 42.42 | 90.70 | 74.54 | 46.88 | 74.60 | 13.33 | 78.57 | 82.38 | 65.19 | 87.20 | 52.00 | 20.27 |
5 | DeepSeek-V2.5-1210 | 236B | OpenSource | 79.30 | 89.08 | 85.10 | 46.97 | 91.23 | 62.78 | 58.88 | 84.60 | 16.67 | 80.23 | 83.28 | 67.15 | 87.80 | 58.25 | 22.30 |
6 | DeepSeek-V3 | 671B | OpenSource | 85.77 | 92.12 | 81.55 | 57.07 | 84.67 | 73.98 | 63.36 | 89.60 | 26.67 | 85.22 | 88.96 | 75.93 | 92.07 | 65.50 | 26.35 |
7 | DeepSeek-R1 | 671B | OpenSource | 83.36 | - | 90.89 | 69.70 | - | - | - | 97.60 | 86.67 | - | - | 83.60 | 98.17 | 83.75 | - |
8 | DeepSeek-R1-Distill-Llama-8B | 8B | OpenSource | 66.17 | - | 75.77 | 20.71 | - | - | - | 82.80 | 40.00 | - | - | 44.50 | 75.61 | 45.75 | - |
9 | DeepSeek-R1-Distill-Llama-70B | 70B | OpenSource | 79.85 | - | 90.00 | 58.59 | - | - | - | 94.20 | 56.67 | - | - | 72.97 | 95.73 | 70.25 | - |
10 | DeepSeek-R1-Distill-Qwen-1.5B | 1.5B | OpenSource | 36.04 | - | 43.51 | 7.07 | - | - | - | 69.60 | 16.67 | - | - | 21.46 | 56.10 | 18.25 | - |
11 | DeepSeek-R1-Distill-Qwen-7B | 7B | OpenSource | 57.67 | - | 69.61 | 27.78 | - | - | - | 87.40 | 43.33 | - | - | 43.95 | 79.88 | 43.50 | - |
12 | DeepSeek-R1-Distill-Qwen-14B | 14B | OpenSource | 75.23 | - | 86.38 | 44.95 | - | - | - | 90.80 | 60.00 | - | - | 65.36 | 92.68 | 65.50 | - |
13 | DeepSeek-R1-Distill-Qwen-32B | 32B | OpenSource | 73.75 | - | 88.15 | 43.43 | - | - | - | 90.20 | 60.00 | - | - | 66.05 | 90.24 | 61.50 | - |
14 | Doubao-pro-32k-241215 | N/A | API | 82.44 | - | 82.99 | 52.53 | - | - | - | 81.40 | 26.67 | - | - | 73.74 | 87.20 | 56.75 | - |
15 | Doubao-pro-32k-240828 | N/A | API | 77.63 | - | 77.87 | 45.96 | - | - | - | 84.40 | 23.33 | - | - | 70.50 | 82.32 | 43.00 | - |
16 | Doubao-1.5-pro-32k-250115 | N/A | API | 85.21 | - | 86.28 | 58.08 | - | - | - | 90.40 | 30.00 | - | - | 78.46 | 90.85 | 62.75 | - |
17 | Gemini-1.5-Pro-Latest | N/A | API | 86.69 | - | 84.27 | 60.10 | - | - | - | 86.20 | 33.33 | - | - | 75.12 | 87.80 | 49.50 | - |
18 | Gemini-2.0-Flash-Exp | N/A | API | 87.99 | - | 85.60 | 59.60 | - | - | - | 93.20 | 30.00 | - | - | 74.95 | 89.63 | 50.50 | - |
19 | Gemma-2-27B-it | 27B | OpenSource | 78.19 | 83.33 | 68.53 | 39.90 | 86.83 | 60.71 | 45.36 | 57.00 | 0.00 | 62.77 | 77.84 | 58.08 | 74.39 | 29.50 | 16.89 |
20 | Gemma-2-9B-it | 9B | OpenSource | 72.46 | 80.57 | 63.22 | 30.81 | 85.27 | 57.27 | 41.44 | 50.40 | 0.00 | 57.16 | 74.56 | 50.44 | 59.76 | 21.75 | 8.78 |
21 | GLM-4-Plus | N/A | API | 79.11 | - | 83.17 | 46.46 | - | - | - | 74.80 | 3.33 | - | - | 70.24 | 85.37 | 43.50 | - |
22 | GLM-4-9B-Chat | 9B | OpenSource | 69.32 | - | 51.38 | 26.26 | - | - | - | 53.20 | 3.33 | - | - | 48.86 | 75.61 | 17.75 | - |
23 | GPT-4o-20241120 | N/A | API | 79.30 | - | 86.36 | 50.00 | - | - | - | 77.60 | 20.00 | - | - | 65.60 | 93.90 | 51.00 | - |
24 | GPT-4o-20240806 | N/A | API | 83.92 | - | 87.47 | 56.57 | - | - | - | 79.20 | 16.67 | - | - | 74.43 | 91.46 | 46.75 | - |
25 | GPT-4o-mini-20240718 | N/A | API | 79.30 | 85.31 | 80.40 | 43.43 | 89.51 | 50.96 | - | - | 6.67 | 65.80 | 83.03 | 64.53 | 86.59 | 39.00 | 24.32 |
26 | GPT-4.5-Preview-20250227 | N/A | API | 87.99 | - | 76.32 | 69.70 | - | - | - | 85.40 | 43.33 | - | - | 81.12 | 95.12 | 78.00 | - |
27 | o1-mini-2024-09-12 | N/A | API | 75.42 | - | 88.65 | 61.11 | - | - | - | 95.00 | 56.67 | - | - | 73.19 | 96.34 | 75.00 | - |
28 | o3-mini-2025-01-31 | N/A | API | 91.87 | - | 89.97 | 68.69 | - | - | - | 96.20 | 83.33 | - | - | 77.71 | 96.34 | 80.75 | - |
29 | Hunyuan-Standard-256k | N/A | API | 71.90 | - | 66.77 | 15.15 | - | - | - | 60.00 | 3.33 | - | - | 16.41 | 80.49 | 16.75 | - |
30 | InternLM2.5-Chat-7B | 7B | OpenSource | 57.30 | 77.64 | 72.81 | 26.77 | 94.79 | 50.38 | 32.08 | 64.60 | 3.33 | 74.19 | 70.67 | 45.30 | 70.12 | 17.50 | 7.43 |
31 | Llama3.1-405B-Instruct-FP8 | 405B | OpenSource | 84.84 | - | 85.62 | 54.55 | - | - | - | 74.00 | 30.00 | - | - | 73.78 | 87.20 | 44.75 | - |
32 | Llama3.1-70B-Instruct | 70B | OpenSource | 80.22 | - | 81.69 | 44.95 | - | - | - | 67.00 | 20.00 | - | - | 67.99 | 78.66 | 34.00 | - |
33 | Llama3.1-8B-Instruct | 8B | OpenSource | 73.38 | 81.61 | 54.21 | 25.25 | 76.73 | 61.24 | 42.64 | 52.60 | 6.67 | 53.91 | 71.81 | 48.00 | 71.95 | 19.75 | 20.27 |
34 | Llama3.2-3B-Instruct | 3B | OpenSource | 70.98 | 71.14 | 51.55 | 19.70 | 62.88 | 50.29 | 36.40 | 44.80 | 6.67 | 42.67 | 63.76 | 39.50 | 57.93 | 16.75 | 8.78 |
35 | Llama3.3-70B-Instruct | 70B | OpenSource | 88.54 | 90.15 | 84.19 | 50.00 | 89.04 | 73.07 | 52.64 | 76.00 | 20.00 | 73.04 | 86.43 | 71.25 | 87.20 | 40.00 | 25.68 |
36 | Ministral-8B-Instruct-2410 | 8B | OpenSource | 55.08 | - | 57.38 | 28.28 | - | - | - | 55.80 | 3.33 | - | - | 43.72 | 78.05 | 22.75 | - |
37 | Mistral-Large-Instruct-2411 | 123B | OpenSource | 80.59 | - | 83.90 | 48.99 | - | - | - | 72.80 | 6.67 | - | - | 70.94 | 88.41 | 46.25 | - |
38 | Mistral-Small-Instruct-2409 | 22B | OpenSource | 63.03 | - | 67.98 | 39.90 | - | - | - | 59.20 | 3.33 | - | - | 55.60 | 76.83 | 22.25 | - |
39 | Moonshot-v1-32K | N/A | API | 60.26 | - | 65.85 | 32.32 | - | - | - | 65.80 | 20.00 | - | - | 45.18 | 71.95 | 25.75 | - |
40 | Qwen-Max-0919 | N/A | API | 81.70 | - | 86.32 | 46.46 | - | - | - | 83.00 | 23.33 | - | - | 70.25 | 90.85 | 57.25 | - |
41 | Qwen2.5-max | N/A | API | 84.66 | 88.79 | 89.63 | 54.04 | 93.60 | 73.43 | 55.52 | 79.60 | 20.00 | 87.11 | 87.54 | 73.61 | 92.68 | 66.50 | 29.73 |
42 | Qwen2.5-72B-Instruct | 72B | OpenSource | 82.99 | 87.45 | 82.47 | 52.02 | 87.24 | 69.34 | 51.36 | 84.80 | 23.33 | 85.05 | 86.33 | 71.31 | 84.15 | 57.25 | 22.30 |
43 | Qwen2.5-7B-Instruct | 7B | OpenSource | 73.01 | 79.76 | 62.02 | 34.34 | 77.64 | 47.96 | 42.24 | 76.60 | 6.67 | 77.35 | 74.27 | 56.24 | 84.76 | 38.25 | 16.22 |
44 | Qwen2.5-14B-Instruct | 14B | OpenSource | 77.63 | 85.65 | 74.87 | 41.92 | 90.65 | 64.98 | 49.76 | 81.00 | 20.00 | 80.09 | 81.48 | 64.50 | 82.93 | 48.00 | 20.27 |
45 | Qwen2.5-32B-Instruct | 32B | OpenSource | 78.93 | 88.05 | 80.95 | 49.49 | 92.11 | 69.37 | 54.80 | 82.40 | 23.33 | 82.88 | 84.05 | 68.61 | 88.41 | 53.75 | 23.65 |
46 | QwQ-32B | 32B | OpenSource | 81.52 | - | 77.35 | 54.04 | - | - | - | 93.20 | 70.00 | - | - | 73.94 | 98.17 | 90.00 | - |
47 | Step-2-16K | N/A | API | 82.62 | - | 82.76 | 47.98 | - | - | - | 77.60 | 10.00 | - | - | 68.13 | 86.59 | 40.50 | - |
48 | Yi-1.5-9B-Chat | 9B | OpenSource | 52.50 | - | 52.10 | 24.24 | - | - | - | 52.80 | 6.67 | - | - | 46.94 | 67.07 | 22.25 | - |
49 | Yi-Lightning | N/A | API | 79.85 | - | 79.28 | 45.45 | - | - | - | 76.00 | 10.00 | - | - | 65.27 | 83.54 | 44.50 | - |
50 | YAYI-Ultra | N/A | API | 82.99 | - | 84.32 | 47.47 | - | - | - | 84.00 | 23.33 | - | - | 71.67 | 85.98 | 57.00 | - |
51 | Phi-4 | 14B | OpenSource | 63.96 | - | 81.99 | 53.03 | - | - | - | 80.40 | 23.33 | - | - | 72.04 | 86.59 | 36.00 | - |