o1-2024-12-17 |
OpenAI |
75.67 |
91.58 |
69.69 |
80.32 |
65.47 |
65.39 |
81.55 |
deepseek-r1 |
DeepSeek |
71.38 |
83.17 |
66.74 |
79.54 |
69.78 |
48.53 |
80.51 |
gemini-2.0-flash-thinking-exp-01-21 |
Google |
66.89 |
78.17 |
53.49 |
75.69 |
69.37 |
42.18 |
82.47 |
o1-preview-2024-09-12 |
OpenAI |
65.79 |
67.42 |
50.85 |
65.49 |
67.69 |
68.72 |
74.60 |
gemini-exp-1206 |
Google |
64.09 |
57.00 |
63.41 |
72.36 |
63.16 |
51.29 |
77.34 |
gemini-2.0-flash-thinking-exp-1219 |
Google |
61.83 |
64.58 |
53.13 |
69.03 |
68.11 |
36.83 |
79.32 |
deepseek-v3 |
DeepSeek |
60.45 |
56.75 |
61.77 |
60.54 |
60.94 |
47.48 |
75.25 |
gemini-2.0-flash-exp |
Google |
59.26 |
59.08 |
54.36 |
60.39 |
61.67 |
38.22 |
81.86 |
claude-3-5-sonnet-20241022 |
Anthropic |
59.03 |
56.67 |
67.13 |
52.28 |
55.03 |
53.76 |
69.30 |
claude-3-5-sonnet-20240620 |
Anthropic |
58.74 |
57.17 |
60.85 |
54.32 |
58.87 |
53.21 |
68.01 |
o1-mini-2024-09-12 |
OpenAI |
57.76 |
72.33 |
48.05 |
61.99 |
57.92 |
40.89 |
65.40 |
gemini-exp-1121 |
Google |
57.36 |
49.92 |
49.75 |
63.75 |
60.29 |
40.30 |
80.15 |
step-2-16k-202411 |
StepFun |
56.02 |
52.17 |
47.19 |
48.77 |
63.72 |
44.39 |
79.88 |
gpt-4o-2024-08-06 |
OpenAI |
55.33 |
53.92 |
51.44 |
49.54 |
60.91 |
47.59 |
68.58 |
gpt-4o-2024-05-13 |
OpenAI |
54.41 |
49.67 |
50.00 |
46.98 |
61.57 |
50.05 |
68.21 |
gemini-1.5-pro-002 |
Google |
54.33 |
49.08 |
48.80 |
59.07 |
54.97 |
43.29 |
70.78 |
grok-2-1212 |
xAI |
54.30 |
54.83 |
46.44 |
54.88 |
54.45 |
45.58 |
69.63 |
gemini-1.5-pro-exp-0827 |
Google |
53.29 |
50.92 |
41.43 |
58.50 |
53.50 |
46.15 |
69.26 |
meta-llama-3.1-405b-instruct-turbo |
Meta |
52.36 |
53.25 |
42.65 |
41.05 |
55.85 |
45.46 |
75.90 |
gpt-4o-2024-11-20 |
OpenAI |
52.19 |
55.75 |
46.08 |
42.87 |
56.15 |
47.37 |
64.94 |
learnlm-1.5-pro-experimental |
Google |
52.19 |
43.42 |
46.87 |
57.77 |
54.97 |
41.98 |
68.16 |
chatgpt-4o-latest-0903 |
OpenAI |
51.66 |
50.50 |
47.44 |
42.45 |
57.93 |
45.30 |
66.37 |
qwen2.5-72b-instruct-turbo |
Alibaba |
51.44 |
45.42 |
57.64 |
54.29 |
51.91 |
34.99 |
64.39 |
dracarys2-72b-instruct |
AbacusAI |
51.05 |
45.83 |
56.64 |
53.98 |
51.14 |
33.48 |
65.22 |
gpt-4-turbo-2024-04-09 |
OpenAI |
50.40 |
50.92 |
49.00 |
43.02 |
54.36 |
44.26 |
60.85 |
llama-3.3-70b-instruct-turbo |
Meta |
50.16 |
50.75 |
36.59 |
42.24 |
49.49 |
39.20 |
82.67 |
grok-beta |
xAI |
49.18 |
37.00 |
45.15 |
45.84 |
54.27 |
43.16 |
69.62 |
claude-3-opus-20240229 |
Anthropic |
49.12 |
40.58 |
38.59 |
43.36 |
57.89 |
50.39 |
63.89 |
gemini-1.5-flash-002 |
Google |
48.59 |
47.00 |
41.87 |
47.63 |
48.35 |
27.92 |
78.76 |
mistral-large-2411 |
Mistral AI |
48.43 |
43.50 |
47.08 |
42.55 |
50.15 |
39.39 |
67.93 |
mistral-large-2407 |
Mistral AI |
48.31 |
41.67 |
47.08 |
44.69 |
53.16 |
39.52 |
63.73 |
qwen2.5-coder-32b-instruct |
Alibaba |
46.23 |
42.08 |
56.85 |
46.61 |
49.87 |
23.25 |
58.69 |
deepseek-v2.5-1210 |
DeepSeek |
45.98 |
40.17 |
46.09 |
51.60 |
48.45 |
31.14 |
58.40 |
gpt-4-0125-preview |
OpenAI |
45.71 |
47.17 |
41.80 |
32.05 |
56.83 |
39.22 |
57.19 |
gemini-1.5-flash-exp-0827 |
Google |
45.21 |
46.33 |
40.35 |
30.60 |
51.40 |
29.60 |
72.97 |
meta-llama-3.1-70b-instruct-turbo |
Meta |
44.89 |
43.00 |
33.49 |
34.72 |
53.75 |
35.42 |
68.98 |
gemini-1.5-pro-001 |
Google |
44.22 |
37.00 |
32.31 |
40.33 |
55.07 |
40.36 |
60.24 |
amazon.nova-pro-v1:0 |
Amazon |
43.55 |
32.58 |
38.15 |
38.14 |
48.31 |
36.96 |
67.13 |
claude-3-5-haiku-20241022 |
Anthropic |
43.45 |
28.08 |
51.36 |
35.54 |
48.45 |
35.37 |
61.88 |
deepseek-r1-distill-qwen-32b |
DeepSeek |
42.89 |
52.25 |
32.85 |
59.12 |
45.41 |
26.82 |
40.92 |
phi-4 |
Microsoft |
41.61 |
47.83 |
30.67 |
41.98 |
45.17 |
25.61 |
58.38 |
gpt-4o-mini-2024-07-18 |
OpenAI |
41.26 |
32.75 |
43.15 |
36.31 |
49.96 |
28.61 |
56.80 |
qwq-32b-preview |
Alibaba |
39.90 |
57.71 |
37.20 |
56.21 |
31.62 |
21.09 |
35.59 |
gemini-1.5-flash-001 |
Google |
39.22 |
34.25 |
34.31 |
32.59 |
49.87 |
31.71 |
52.58 |
gemma-2-27b-it |
Google |
38.19 |
28.08 |
35.95 |
26.52 |
47.87 |
32.62 |
58.10 |
gemini-1.5-flash-8b-exp-0827 |
Google |
36.67 |
35.00 |
28.74 |
28.12 |
37.32 |
20.80 |
70.02 |
amazon.nova-lite-v1:0 |
Amazon |
36.35 |
36.67 |
27.46 |
36.70 |
37.23 |
25.93 |
54.13 |
gemini-1.5-flash-8b-exp-0924 |
Google |
36.01 |
23.75 |
28.67 |
31.66 |
42.28 |
19.13 |
70.55 |
qwen2.5-7b-instruct-turbo |
Alibaba |
34.90 |
28.42 |
38.37 |
39.49 |
35.22 |
15.80 |
52.11 |
claude-3-haiku-20240307 |
Anthropic |
33.85 |
26.33 |
24.46 |
23.37 |
44.47 |
29.13 |
55.32 |
mistral-small-2409 |
Mistral AI |
33.39 |
29.92 |
25.74 |
24.25 |
42.73 |
24.49 |
53.23 |
mixtral-8x22b-instruct-v0.1 |
Mistral AI |
32.45 |
26.33 |
32.03 |
26.57 |
35.67 |
21.81 |
52.32 |
command-r-plus-08-2024 |
Cohere |
31.76 |
24.75 |
19.14 |
21.27 |
38.06 |
29.73 |
57.61 |
amazon.nova-micro-v1:0 |
Amazon |
29.56 |
25.08 |
20.18 |
34.35 |
33.95 |
15.78 |
48.04 |
gemma-2-9b-it |
Google |
28.66 |
15.17 |
22.46 |
19.80 |
36.39 |
25.53 |
52.62 |
mistral-small-2402 |
Mistral AI |
28.36 |
19.17 |
21.18 |
19.92 |
34.59 |
18.89 |
56.40 |
command-r-08-2024 |
Cohere |
27.31 |
21.92 |
17.90 |
18.36 |
33.34 |
16.72 |
55.62 |
command-r-plus-04-2024 |
Cohere |
27.11 |
20.58 |
19.46 |
17.99 |
25.48 |
19.70 |
59.47 |
meta-llama-3.1-8b-instruct-turbo |
Meta |
25.97 |
13.33 |
18.74 |
18.31 |
32.82 |
17.71 |
54.90 |
phi-3-small-8k-instruct |
Microsoft |
24.03 |
15.92 |
20.26 |
17.58 |
30.29 |
12.94 |
47.20 |
phi-3-mini-128k-instruct |
Microsoft |
22.36 |
20.50 |
15.04 |
15.72 |
34.69 |
9.15 |
39.08 |
olmo-2-1124-13b-instruct |
AllenAI |
22.09 |
16.33 |
10.41 |
13.51 |
20.60 |
11.16 |
60.56 |
phi-3-mini-4k-instruct |
Microsoft |
22.08 |
26.83 |
15.54 |
14.96 |
30.21 |
8.56 |
36.36 |