AZR 提出的程序
所有 AZR 提出的代码示例首先使用 jina-embeddings-v2-base-code 进行嵌入,然后用 UMAP 投影到二维空间。
将鼠标悬停在点上以查看程序。点击锁定选择。
def f(input_list: list[int]) -> list[int]:
from collections import Counter
state = {}
output = []
for num in input_list:
digit_counts = Counter(str(abs(num)))
even_count = sum([int(digit) % 2 == 0 for digit in digit_counts.keys()])
odd_count = int(digit_counts.total()) - even_count
max_count = max(even_count, odd_count)
state[num] = (even_count, odd_count)
output.append(max_count)
return output
旋转六边形体验评测
提示:编写一个脚本,展示 10 个球在旋转的六边形内弹跳。球受重力和摩擦力影响,并且必须在旋转的六边形壁面上真实地反弹。
AZR-Coder-14b (我们的)

import pygame
import math
import random
# Pygame demo: ten balls affected by gravity and friction are drawn together
# with a hexagon that rotates a little further every frame.

# Initialize Pygame
pygame.init()

# Set up the display
width, height = 800, 600
screen = pygame.display.set_mode((width, height))
pygame.display.set_caption("Bouncing Balls in a Spinning Hexagon")

# Define colors
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
RED = (255, 0, 0)

# Define constants
FPS = 60
GRAVITY = 0.1      # added to the vertical velocity every frame
FRICTION = 0.99    # both velocity components are scaled by this every frame
BALL_RADIUS = 10
NUM_BALLS = 10

# Define the hexagon (unrotated vertices around the screen centre)
hexagon_center = (width // 2, height // 2)
hexagon_radius = 200
hexagon_vertices = [
    (hexagon_center[0] + hexagon_radius * math.cos(2 * math.pi * i / 6),
     hexagon_center[1] + hexagon_radius * math.sin(2 * math.pi * i / 6))
    for i in range(6)
]

# Define the balls as (x, y, vx, vy, color) tuples.
# NOTE(review): start positions are drawn from the whole window, so a ball
# is not guaranteed to start inside the hexagon.
balls = []
for _ in range(NUM_BALLS):
    x = random.randint(BALL_RADIUS, width - BALL_RADIUS)
    y = random.randint(BALL_RADIUS, height - BALL_RADIUS)
    vx = random.uniform(-5, 5)
    vy = random.uniform(-5, 5)
    balls.append((x, y, vx, vy, RED))

# Game loop
running = True
clock = pygame.time.Clock()
angle = 0
while running:
    # Handle events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False

    # Clear the screen
    screen.fill(WHITE)

    # Rotate the hexagon: apply a 2D rotation by `angle` around its centre
    angle += 0.01
    rotated_vertices = [
        (hexagon_center[0] + (v[0] - hexagon_center[0]) * math.cos(angle) - (v[1] - hexagon_center[1]) * math.sin(angle),
         hexagon_center[1] + (v[0] - hexagon_center[0]) * math.sin(angle) + (v[1] - hexagon_center[1]) * math.cos(angle))
        for v in hexagon_vertices
    ]

    # Draw the hexagon (filled, because no line width is passed)
    pygame.draw.polygon(screen, BLACK, rotated_vertices)

    # Update the positions and velocities of the balls
    for i, (x, y, vx, vy, color) in enumerate(balls):
        # Apply gravity
        vy += GRAVITY
        # Apply friction
        vx *= FRICTION
        vy *= FRICTION
        # Update position
        x += vx
        y += vy

        # Check for collisions with the hexagon walls
        for j in range(6):
            v1 = rotated_vertices[j]
            v2 = rotated_vertices[(j + 1) % 6]
            dx = v2[0] - v1[0]
            dy = v2[1] - v1[1]
            dist = math.sqrt(dx**2 + dy**2)  # edge length
            # Unit vector perpendicular to the edge
            nx = dy / dist
            ny = -dx / dist
            px = x - v1[0]
            py = y - v1[1]
            proj = px * nx + py * ny
            # NOTE(review): `proj` is the offset along the edge normal, yet
            # it is range-checked against the edge length, while the value
            # compared with BALL_RADIUS below is the (negated) offset along
            # the edge. The two quantities look swapped - confirm intent.
            if 0 <= proj <= dist:
                if px * ny - py * nx <= BALL_RADIUS:
                    overlap = BALL_RADIUS - (px * nx + py * ny)
                    x -= overlap * nx
                    y -= overlap * ny
                    # NOTE(review): reverses the whole velocity (scaled by
                    # 0.9) instead of reflecting it about the wall normal.
                    vx = -vx * 0.9
                    vy = -vy * 0.9

        # Check for collisions with the screen edges
        if x - BALL_RADIUS < 0 or x + BALL_RADIUS > width:
            vx = -vx
        if y - BALL_RADIUS < 0 or y + BALL_RADIUS > height:
            vy = -vy

        # Update the ball's position and velocity
        balls[i] = (x, y, vx, vy, color)

        # Draw the ball
        pygame.draw.circle(screen, color, (int(x), int(y)), BALL_RADIUS)

    # Update the display
    pygame.display.flip()
    clock.tick(FPS)

# Quit Pygame
pygame.quit()
{
"parameters": {
"temperature": 0.6,
"top_p": 0.8
},
"generation_config": {
"do_sample": true
}
}
GPT-4o-mini(GPT-4o 小型版)

import pygame
import random
import math
# Module-level setup for the bouncing-balls demo: window, colours and the
# physics/geometry constants read by the code below.
# Initialize Pygame
pygame.init()
# Screen settings
WIDTH, HEIGHT = 800, 600
screen = pygame.display.set_mode((WIDTH, HEIGHT))
pygame.display.set_caption('Bouncing Balls in Spinning Hexagon')
# Colors (RGB tuples)
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
BALL_COLOR = (255, 0, 0)
HEX_COLOR = (0, 255, 0)
# Constants
GRAVITY = 0.1 # Added to a ball's vertical velocity on every move()
FRICTION = 0.99 # Both velocity components are multiplied by this on every move()
BALL_RADIUS = 10 # Radius of the balls
NUM_BALLS = 10 # Number of balls
# Hexagon settings
HEX_SIZE = 250 # Distance from the hexagon centre to each vertex
HEX_CENTER = (WIDTH // 2, HEIGHT // 2)
# Ball class
class Ball:
    """A ball with a position, a radius and a random initial velocity."""

    def __init__(self, x, y, radius):
        """Place the ball at (x, y) and give it a random velocity.

        Each velocity component is drawn uniformly from [-2, 2].
        """
        self.x = x
        self.y = y
        self.radius = radius
        self.vx = random.uniform(-2, 2)  # Initial x velocity
        self.vy = random.uniform(-2, 2)  # Initial y velocity

    def move(self):
        """Advance one step: move, then apply gravity, then apply friction."""
        self.x += self.vx
        self.y += self.vy
        self.vy += GRAVITY  # Apply gravity
        # Apply friction
        self.vx *= FRICTION
        self.vy *= FRICTION

    def bounce(self, normal):
        """Reflect the velocity about ``normal``.

        Args:
            normal: (nx, ny) pair. The formula v - 2 (v . n) n is a pure
                reflection only when ``normal`` has unit length; callers in
                this script normalise it first.
        """
        # Reflect the ball's velocity based on the surface normal
        dot = self.vx * normal[0] + self.vy * normal[1]
        self.vx -= 2 * dot * normal[0]
        self.vy -= 2 * dot * normal[1]
# Function to draw the hexagon
def draw_hexagon(angle):
    """Draw the hexagon outline rotated by ``angle``.

    Args:
        angle: Rotation in degrees; it is added to each vertex's multiple of
            60 degrees before conversion with ``math.radians``.
    """
    points = []
    for i in range(6):
        x = HEX_CENTER[0] + HEX_SIZE * math.cos(math.radians(i * 60 + angle))
        y = HEX_CENTER[1] + HEX_SIZE * math.sin(math.radians(i * 60 + angle))
        points.append((x, y))
    # Width 5 draws an outline rather than a filled polygon.
    pygame.draw.polygon(screen, HEX_COLOR, points, 5)
# Function to check if the ball is inside the hexagon
def check_collision_with_hexagon(ball):
    """Bounce ``ball`` off the first hexagon edge it is too close to.

    The hexagon vertices are rebuilt from a rotation angle that is re-read
    from ``pygame.time.get_ticks()`` inside this function.

    Args:
        ball: Object exposing ``x``, ``y``, ``radius`` and ``bounce(normal)``.
    """
    # NOTE(review): the angle is sampled from the clock again here and is not
    # passed in by the caller, so it can differ from the angle the hexagon
    # was drawn with in the same frame.
    angle = pygame.time.get_ticks() / 10  # Time-dependent rotation angle
    # Hexagon vertices (rotated)
    hex_points = []
    for i in range(6):
        x = HEX_CENTER[0] + HEX_SIZE * math.cos(math.radians(i * 60 + angle))
        y = HEX_CENTER[1] + HEX_SIZE * math.sin(math.radians(i * 60 + angle))
        hex_points.append((x, y))
    for i in range(6):
        p1 = hex_points[i]
        p2 = hex_points[(i + 1) % 6]
        # Calculate the normal vector of the edge
        edge = (p2[0] - p1[0], p2[1] - p1[1])
        normal = (-edge[1], edge[0])  # Perpendicular to the edge
        # Normalize the normal
        length = math.sqrt(normal[0]**2 + normal[1]**2)
        normal = (normal[0] / length, normal[1] / length)
        # Signed offset of the ball centre from the edge line along `normal`
        ball_to_edge = (ball.x - p1[0], ball.y - p1[1])
        dist = ball_to_edge[0] * normal[0] + ball_to_edge[1] * normal[1]
        if dist < ball.radius:
            # NOTE(review): only the velocity is reflected; the position is
            # not corrected, and the reflection is applied whichever way the
            # ball is currently moving.
            ball.bounce(normal)
            break  # handle at most one edge per call
# Create a list of balls at random positions at least 50 px from the window
# border.
balls = [Ball(random.randint(50, WIDTH-50), random.randint(50, HEIGHT-50), BALL_RADIUS) for _ in range(NUM_BALLS)]

# Main game loop
running = True
clock = pygame.time.Clock()
while running:
    screen.fill(BLACK)  # Clear screen

    # Handle events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False

    # Draw the spinning hexagon
    angle = pygame.time.get_ticks() / 10  # Get a time-based rotation angle
    draw_hexagon(angle)

    # Update and draw the balls
    for ball in balls:
        ball.move()
        check_collision_with_hexagon(ball)
        # Draw ball
        pygame.draw.circle(screen, BALL_COLOR, (int(ball.x), int(ball.y)), ball.radius)

    # Refresh the display
    pygame.display.flip()
    # Cap the frame rate
    clock.tick(60)

# Quit Pygame
pygame.quit()
{
"query_date": "2025-05-03",
"api_location": "https://chatgpt.com/"
}
Qwen2.5-72B-Instruct

import pygame
import math
import random
# Module-level setup: window, colours, hexagon geometry and ball physics
# parameters used by the functions and main loop below.
# Initialize Pygame
pygame.init()
# Screen dimensions
WIDTH, HEIGHT = 800, 600
screen = pygame.display.set_mode((WIDTH, HEIGHT))
pygame.display.set_caption("Balls Bouncing in a Spinning Hexagon")
# Colors (RGB tuples)
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
RED = (255, 0, 0)
# Hexagon properties
hexagon_center = (WIDTH // 2, HEIGHT // 2)
hexagon_radius = 200
hexagon_sides = 6
hexagon_angle = 0 # Initial angle of the hexagon
hexagon_rotation_speed = 0.01 # Rotation speed in radians per frame
# Ball properties
num_balls = 10
balls = []  # filled with [x, y, vx, vy] lists
ball_radius = 10
gravity = 0.5  # added to vy once per frame
friction = 0.99  # vx and vy are multiplied by this once per frame
# Function to create a ball
def create_ball():
    """Return a new ball as a mutable ``[x, y, vx, vy]`` list.

    The position is a random integer point whose coordinates keep the ball
    fully inside the window; each velocity component is drawn uniformly from
    [-3, 3].
    """
    # NOTE(review): positions span the whole window, so a ball may start
    # outside the hexagon.
    x = random.randint(ball_radius, WIDTH - ball_radius)
    y = random.randint(ball_radius, HEIGHT - ball_radius)
    vx = random.uniform(-3, 3)
    vy = random.uniform(-3, 3)
    return [x, y, vx, vy]


# Create balls
for _ in range(num_balls):
    balls.append(create_ball())
# Function to draw the hexagon
def draw_hexagon(angle):
    """Draw the hexagon outline rotated by ``angle`` radians."""
    points = []
    for i in range(hexagon_sides):
        angle_i = angle + i * (2 * math.pi / hexagon_sides)
        x = hexagon_center[0] + hexagon_radius * math.cos(angle_i)
        y = hexagon_center[1] + hexagon_radius * math.sin(angle_i)
        points.append((x, y))
    # Width 3 draws an outline rather than a filled polygon.
    pygame.draw.polygon(screen, RED, points, 3)
# Function to check collision with the hexagon
def check_collision(ball, angle):
    """Resolve collisions between ``ball`` and every hexagon edge.

    For each edge the closest point on the segment to the ball centre is
    found. If it is nearer than ``ball_radius`` the velocity is reflected
    about the direction from that point to the centre, and the ball is pushed
    out until it just touches the edge.

    Args:
        ball: Mutable ``[x, y, vx, vy]`` list; updated in place.
        angle: Current hexagon rotation in radians.
    """
    x, y, vx, vy = ball
    for i in range(hexagon_sides):
        angle_i = angle + i * (2 * math.pi / hexagon_sides)
        angle_next = angle + (i + 1) * (2 * math.pi / hexagon_sides)
        x1 = hexagon_center[0] + hexagon_radius * math.cos(angle_i)
        y1 = hexagon_center[1] + hexagon_radius * math.sin(angle_i)
        x2 = hexagon_center[0] + hexagon_radius * math.cos(angle_next)
        y2 = hexagon_center[1] + hexagon_radius * math.sin(angle_next)
        # Check collision with the line segment (x1, y1) to (x2, y2)
        dx = x2 - x1
        dy = y2 - y1
        dot = dx * (x - x1) + dy * (y - y1)
        len_sq = dx * dx + dy * dy
        # Fraction along the segment of the projected ball centre;
        # -1 selects the first endpoint when the segment has zero length.
        param = -1
        if len_sq != 0:  # in case of a zero length line
            param = dot / len_sq
        # Clamp to the segment: (nx, ny) is the closest point on the edge.
        if param < 0:
            nx = x1
            ny = y1
        elif param > 1:
            nx = x2
            ny = y2
        else:
            nx = x1 + param * dx
            ny = y1 + param * dy
        dx = x - nx
        dy = y - ny
        dist = math.sqrt(dx * dx + dy * dy)
        if dist < ball_radius:
            # Calculate the normal and tangent vectors
            # NOTE(review): divides by `dist`, which is zero if the ball
            # centre lies exactly on the edge.
            nx = (x - nx) / dist
            ny = (y - ny) / dist
            # NOTE(review): the tangent (tx, ty) is computed but never used.
            tx = -ny
            ty = nx
            # Reflect the velocity
            dot = vx * nx + vy * ny
            vx = vx - 2 * dot * nx
            vy = vy - 2 * dot * ny
            # Move the ball away from the wall
            x += nx * (ball_radius - dist)
            y += ny * (ball_radius - dist)
    ball[0] = x
    ball[1] = y
    ball[2] = vx
    ball[3] = vy
# Main loop
running = True
clock = pygame.time.Clock()
while running:
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False

    # Update ball positions and velocities
    for ball in balls:
        x, y, vx, vy = ball
        vy += gravity
        x += vx
        y += vy
        vx *= friction
        vy *= friction
        # Write the integrated state back before the collision check, which
        # reads and updates the list in place.
        ball[0] = x
        ball[1] = y
        ball[2] = vx
        ball[3] = vy
        # Check collision with the hexagon
        check_collision(ball, hexagon_angle)

    # Update hexagon angle
    hexagon_angle += hexagon_rotation_speed

    # Draw everything
    screen.fill(BLACK)
    draw_hexagon(hexagon_angle)
    for ball in balls:
        pygame.draw.circle(screen, WHITE, (int(ball[0]), int(ball[1])), ball_radius)
    pygame.display.flip()
    clock.tick(60)

pygame.quit()
{
"api_location": "https://huggingface.co/Qwen/Qwen2.5-Coder-72B-Instruct"
}
Qwen2.5-32B-Instruct

import pygame
import math
import random
# Module-level setup: window, colours and simulation constants used by the
# Ball class and helper functions below.
# Initialize Pygame
pygame.init()
# Set up the display
screen_width, screen_height = 800, 600
win = pygame.display.set_mode((screen_width, screen_height))
pygame.display.set_caption("Bouncing Balls Inside Spinning Hexagon")
# Colors (RGB tuples)
WHITE = (255, 255, 255)
RED = (255, 0, 0)
# Constants for the simulation
FPS = 60
GRAVITY = 0.1  # added to a ball's vertical velocity on every update
BALL_RADIUS = 10
FRIC_COEFFICIENT = 0.98 # Velocity is scaled by this factor on each bounce.
MAX_ANGLE_DEGREES = 360  # used both as the wrap-around modulus and the spin rate
DEG_TO_RAD = math.pi / 180 # Conversion factor between degrees and radians.
class Ball:
    """A ball that falls under gravity and bounces off the window borders."""

    def __init__(self):
        """Create the ball at a random in-window position with a random velocity."""
        self.pos = pygame.Vector2(random.randint(BALL_RADIUS, screen_width-BALL_RADIUS),
                                  random.randint(BALL_RADIUS, screen_height-BALL_RADIUS))
        self.vel = pygame.Vector2(random.uniform(-3, 3), random.uniform(-3, 3))

    def update(self):
        """Apply gravity, move, then bounce off and clamp to the window borders."""
        self.vel.y += GRAVITY
        self.pos += self.vel
        # Check for collisions with borders
        if self.pos.x - BALL_RADIUS <= 0 or self.pos.x + BALL_RADIUS >= screen_width:
            # Reverse and damp the horizontal velocity, then clamp position.
            self.vel.x *= -FRIC_COEFFICIENT
            if self.pos.x - BALL_RADIUS <= 0:
                self.pos.x = BALL_RADIUS
            elif self.pos.x + BALL_RADIUS >= screen_width:
                self.pos.x = screen_width - BALL_RADIUS
        if self.pos.y - BALL_RADIUS <= 0 or self.pos.y + BALL_RADIUS >= screen_height:
            # Reverse and damp the vertical velocity, then clamp position.
            self.vel.y *= -FRIC_COEFFICIENT
            if self.pos.y - BALL_RADIUS <= 0:
                self.pos.y = BALL_RADIUS
            elif self.pos.y + BALL_RADIUS >= screen_height:
                self.pos.y = screen_height - BALL_RADIUS

    def draw(self, win):
        """Draw the ball as a filled red circle on the surface ``win``."""
        pygame.draw.circle(win, RED, (int(self.pos.x), int(self.pos.y)), BALL_RADIUS)
def get_hexagon_points(center, radius, angle_degrees):
    """Generate points of a hexagon.

    Args:
        center: Object with ``x`` and ``y`` attributes (hexagon centre).
        radius: Distance from the centre to each vertex.
        angle_degrees: Rotation of the hexagon in degrees.

    Returns:
        List of six ``pygame.Vector2`` vertices.
    """
    points = []
    for i in range(6):
        theta = (math.pi * 2) / 6 * i + angle_degrees * DEG_TO_RAD
        x = center.x + radius * math.cos(theta)
        y = center.y + radius * math.sin(theta)
        points.append(pygame.Vector2(x, y))
    return points
def check_bounce(ball, poly_points):
    """Check if the ball has hit any of the edges and bounce off appropriately.

    Args:
        ball: Object with ``pos`` and ``vel`` vectors; ``vel`` may be replaced.
        poly_points: Exactly six vertices (the sequence is unpacked into six
            names).
    """
    p1, p2, p3, p4, p5, p6 = poly_points
    for start_point, end_point in [(p1, p2), (p2, p3), (p3, p4), (p4, p5), (p5, p6), (p6, p1)]:
        line_vec = end_point - start_point
        normal_vec = pygame.Vector2(-line_vec.y, line_vec.x).normalize()  # Normal vector to edge
        # Line equation parameters: Ax + By + C = 0
        # NOTE(review): (A, B) = (normal.y, -normal.x) is perpendicular to the
        # edge normal, i.e. parallel to the edge, so `distance` below measures
        # the offset along the edge rather than the distance to it - confirm.
        A = normal_vec.y
        B = -normal_vec.x
        C = -(A * start_point.x + B * start_point.y)
        # Point-line-distance formula
        distance = A * ball.pos.x + B * ball.pos.y + C
        distance /= math.sqrt(A**2 + B**2)
        if abs(distance) < BALL_RADIUS:
            # Ball is hitting this edge.
            dist_vec = ball.pos - (start_point + end_point) / 2
            dot_product = dist_vec.dot(normal_vec)
            reflection_vec = ball.vel.reflect(normal_vec)
            if dot_product < 0 and reflection_vec.x * A + reflection_vec.y * B > 0:
                # Reflect and apply some slowdown using friction coefficient.
                ball.vel = reflection_vec * FRIC_COEFFICIENT
def redraw_window(win, balls, poly_points):
    """Redraw one frame: white background, filled blue hexagon, then the balls.

    Args:
        win: Surface to draw on.
        balls: Iterable of objects with a ``draw(win)`` method.
        poly_points: Hexagon vertices with ``x`` and ``y`` attributes.
    """
    win.fill(WHITE)
    pygame.draw.polygon(win, (0, 0, 255), [(p.x, p.y) for p in poly_points])
    for ball in balls:
        ball.draw(win)
    pygame.display.update()
def main():
    """Run the simulation until the window is closed."""
    run = True
    clock = pygame.time.Clock()
    balls = [Ball() for _ in range(10)]
    hexagon_center = pygame.Vector2(screen_width // 2, screen_height // 2)
    hexagon_radius = screen_height // 4
    angle_degrees = 0
    while run:
        dt = clock.tick(FPS) / 1000  # time elapsed since last frame in seconds
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                run = False
        # Spin the hexagon by increasing the angle. The increment is
        # dt * MAX_ANGLE_DEGREES, i.e. 360 degrees per second of elapsed time.
        angle_degrees = (angle_degrees + 1 * dt * MAX_ANGLE_DEGREES) % MAX_ANGLE_DEGREES
        poly_points = get_hexagon_points(hexagon_center, hexagon_radius, angle_degrees)
        # Update each ball and check for collision with hexagon walls.
        for ball in balls:
            ball.update()
            check_bounce(ball, poly_points)
        redraw_window(win, balls, poly_points)
    pygame.quit()


if __name__ == "__main__":
    main()
{
"api_location": "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct"
}
Qwen2.5-14B-Instruct

import pygame
import sys
import math
import random
# Module-level setup: window, colours and the ball/hexagon/physics constants
# used by the functions below.
# Initialize Pygame
pygame.init()
# Screen dimensions
SCREEN_WIDTH = 800
SCREEN_HEIGHT = 600
screen = pygame.display.set_mode((SCREEN_WIDTH, SCREEN_HEIGHT))
pygame.display.set_caption("Bouncing Balls in Spinning Hexagon")
# Colors (RGB tuples)
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
RED = (255, 0, 0)
# Ball properties
BALL_RADIUS = 10
BALL_COUNT = 10
balls = []  # filled by create_balls() with [x, y, vx, vy] lists
# Hexagon properties
HEX_SIDES = 6
HEX_RADIUS = 200
HEX_CENTER = (SCREEN_WIDTH // 2, SCREEN_HEIGHT // 2)
# Added to the rotation angle once per frame; draw_hexagon() converts that
# angle with math.radians(), so this is effectively 0.01 degrees per frame.
HEX_SPIN_SPEED = 0.01
# Gravity and friction
GRAVITY = 0.5
FRICTION = 0.99
def create_balls():
    """Append ``BALL_COUNT`` new ``[x, y, vx, vy]`` balls to the global list.

    Positions are within 100 px of the screen centre on each axis; each
    velocity component is drawn uniformly from [-3, 3].
    """
    for _ in range(BALL_COUNT):
        x = SCREEN_WIDTH // 2 + (random.randint(-100, 100))
        y = SCREEN_HEIGHT // 2 + (random.randint(-100, 100))
        vx = random.uniform(-3, 3)
        vy = random.uniform(-3, 3)
        balls.append([x, y, vx, vy])
def draw_hexagon(angle):
    """Return the hexagon's vertices for rotation ``angle``.

    Despite the name, nothing is drawn here; the caller draws the polygon.

    Args:
        angle: Rotation in degrees (it is subtracted from each vertex's
            multiple of 60 and passed through ``math.radians``).

    Returns:
        List of ``HEX_SIDES`` integer ``(x, y)`` tuples.
    """
    points = []
    for i in range(HEX_SIDES):
        # NOTE(review): the 60-degree step is hard-coded, so this matches
        # HEX_SIDES only while HEX_SIDES == 6.
        angle_rad = math.radians(i * 60 - angle)
        x = HEX_CENTER[0] + HEX_RADIUS * math.cos(angle_rad)
        y = HEX_CENTER[1] + HEX_RADIUS * math.sin(angle_rad)
        points.append((int(x), int(y)))
    return points
def check_collision(ball, points):
    """Return True if the ball centre lies in any (edge, centre) triangle.

    Each pair of consecutive vertices forms a triangle with ``HEX_CENTER``.

    NOTE(review): together those triangles tile the polygon, so this returns
    True while the ball is *inside* the hexagon rather than when it touches a
    wall - confirm this is the intended meaning of "collision".

    Args:
        ball: Sequence whose first two items are the x and y position.
        points: Polygon vertices as ``(x, y)`` pairs.
    """
    px, py = ball[:2]
    for i in range(len(points)):
        j = (i + 1) % len(points)
        if is_point_in_triangle(px, py, points[i], points[j], HEX_CENTER):
            return True
    return False
def is_point_in_triangle(px, py, p1, p2, p3):
    """Return True if the point (px, py) lies inside triangle p1-p2-p3.

    The point is inside when it is on the same side of all three edges, which
    is checked through the sign of a 2D cross product per edge.

    Args:
        px, py: Coordinates of the point to test.
        p1, p2, p3: Triangle vertices as ``(x, y)`` pairs.
    """
    def sign(p1, p2, p3):
        # Cross product of (p1 - p3) and (p2 - p3).
        return (p1[0] - p3[0]) * (p2[1] - p3[1]) - (p2[0] - p3[0]) * (p1[1] - p3[1])

    b1 = sign((px, py), p1, p2) < 0
    b2 = sign((px, py), p2, p3) < 0
    b3 = sign((px, py), p3, p1) < 0
    return ((b1 == b2) and (b2 == b3))
def update_ball(ball, points):
    """Advance ``ball`` by one frame and write the new state back in place.

    Args:
        ball: Mutable ``[x, y, vx, vy]`` list.
        points: Current hexagon vertices.
    """
    x, y, vx, vy = ball
    x += vx
    y += vy
    vy += GRAVITY
    # NOTE(review): the check runs on `ball`, which still holds the position
    # from before the move above.
    if check_collision(ball, points):
        # Bounce logic
        normal_vector = get_normal_vector(ball, points)
        # NOTE(review): reflect_vector() stores the reflected velocity in
        # ball[2] and ball[3], but both are overwritten below from the local
        # vx and vy, so the bounce has no lasting effect.
        reflect_vector(normal_vector, ball)
    # Apply friction
    vx *= FRICTION
    vy *= FRICTION
    ball[0] = x
    ball[1] = y
    ball[2] = vx
    ball[3] = vy
def get_normal_vector(ball, points):
    """Return the unit vector from the ball towards its nearest vertex.

    Args:
        ball: Sequence whose first two items are the x and y position.
        points: Non-empty sequence of ``(x, y)`` vertices.

    Returns:
        ``[nx, ny]`` of unit length.

    Raises:
        ZeroDivisionError: If the ball is exactly on the nearest vertex.
    """
    px, py = ball[:2]
    # NOTE(review): this uses the nearest vertex, not the nearest edge, so
    # the result is not generally perpendicular to a wall.
    closest_point = min(points, key=lambda point: math.hypot(point[0] - px, point[1] - py))
    dx, dy = closest_point[0] - px, closest_point[1] - py
    length = math.sqrt(dx**2 + dy**2)
    return [dx / length, dy / length]
def reflect_vector(normal, ball):
    """Reflect the ball's velocity about ``normal`` in place.

    Uses v - 2 (v . n) n and stores the result in ``ball[2]`` and ``ball[3]``;
    the position entries are left untouched.

    Args:
        normal: ``(nx, ny)`` pair, expected to be of unit length.
        ball: Mutable ``[x, y, vx, vy]`` list.
    """
    px, py, vx, vy = ball
    dot_product = vx * normal[0] + vy * normal[1]
    reflected_vx = vx - 2 * dot_product * normal[0]
    reflected_vy = vy - 2 * dot_product * normal[1]
    ball[2] = reflected_vx
    ball[3] = reflected_vy
def main():
    """Create the balls and run the render loop until the window is closed."""
    clock = pygame.time.Clock()
    angle = 0
    create_balls()
    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                # The loop has no other exit; quitting ends the process.
                pygame.quit()
                sys.exit()
        screen.fill(WHITE)
        # Draw the hexagon (filled, because no line width is passed)
        points = draw_hexagon(angle)
        pygame.draw.polygon(screen, BLACK, points)
        # Update and draw balls
        for ball in balls:
            update_ball(ball, points)
            pygame.draw.circle(screen, RED, (int(ball[0]), int(ball[1])), BALL_RADIUS)
        angle += HEX_SPIN_SPEED
        pygame.display.flip()
        clock.tick(60)


if __name__ == "__main__":
    main()
{
"parameters": {
"temperature": 0.6,
"top_p": 0.8,
"do_sample": true
}
}
1. 绝对零范式
传统训练推理模型的方法主要依赖于人类精心挑选的数据:
监督微调(SFT)需要包含人工撰写的查询、理由和答案的数据集。
带有可验证奖励的强化学习(RLVR)仍然需要人工标注的任务和答案,即使模型能生成自己的推理过程。
绝对零范式消除了对人类数据和人工设定任务的依赖。模型同时提出任务、解决任务,并通过自我对弈从这两个阶段中学习。如图 1 所示,智能体自主生成针对可学习性优化的任务,并使用统一模型学习解决这些任务。
智能代理 π 承担着双重角色:作为提议者 π propose 提出任务 τ,并作为求解者 π solve 生成答案 y。外部环境 e 负责验证这些提议,转化为 (x, y★) 的样本对,同时提供学习奖励 r propose 和解决方案奖励 r solve 。这种机制在无需人工筛选数据的情况下,实现了持续自我优化。

绝对零范式。监督学习依赖人工整理的推理轨迹来克隆行为。通过验证奖励的强化学习,使智能体能够自主学习推理,但仍依赖专家定义的学习分布和精心挑选的问答对,这需要专业知识和手动操作。相比之下,我们引入了一种新的范式,Absolute Zero,用于在没有任何人工整理的数据的情况下训练推理模型。 我们设想该智能体能够自主提出针对可学习性的任务,并通过统一模型学习如何解决这些任务。该智能体通过与提供可验证反馈的环境互动来学习,从而实现完全无需人工干预的可靠且持续的自我改进。
2. 绝对零推理器
绝对零推理器(AZR)是我们首次实现的绝对零范式。AZR 采用统一的语言模型,通过代码推理挑战进行学习。该模型通过持续自我改进,无需人类数据。
2.1. 提议者与求解者角色
绝对零推理器采用了一个统一的模型,该模型承担着两个互补的角色:
提议者角色:生成具有高学习潜能的任务,既不过于简单,也不太难。模型因创建提供有意义学习机会的挑战而获得奖励。
解决者角色:尝试解决提议的问题,根据解决方案的正确性获得二进制奖励,通过 Python 执行验证。
对于提议者,我们设计了一个基于蒙特卡洛抽样的奖励函数,以鼓励生成难度最优的任务——即那些有时成功有时失败的问题。这为持续改进提供了最丰富的学习信号。
2.2. 推理模式
如图 3 所示,绝对零推理器支持三种基本推理模式,每种模式都关注三元组(程序、输入、输出)的不同方面:
演绎(Deduction):根据程序和输入预测输出,并逐步进行逻辑推理。这测试了模型追踪程序执行的能力。
溯因(Abduction):根据程序及其输出推断合理的输入,类似于试错或搜索过程。这测试了模型从结果反向推导的能力。
归纳:通过输入输出示例生成程序,需要从部分信息中进行泛化。这测试了模型发现潜在模式的能力。
模型从少量种子示例(例如恒等函数)开始,通过持续的自我对弈和验证,逐步发展出越来越复杂的推理能力。

绝对零推理器的训练概览。在每次迭代中,绝对零推理器首先根据存储在缓冲区中的过去自生成的三元组以及特定任务类型(溯因、演绎或归纳),提出一批任务。然后,使用 Python 过滤并构建有效的代码推理问题。同时,还会为每个提议的任务计算可学习性奖励。
2.3. 绝对零推理器算法

3. 结果
3.1 主要成果
Model | Base | #data | HEval+ | MBPP+ | LCBv5 | AIME24 | AIME25 | AMC | M500 | Minerva | Olympiad | CAvg | MAvg | AVG |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
基础模型 | ||||||||||||||
Qwen2.5-7B | - | - | 73.2 | 65.3 | 17.5 | 6.7 | 3.3 | 37.5 | 64.8 | 25.0 | 27.7 | 52.0 | 27.5 | 39.8 |
Qwen2.5-7B-Ins | - | - | 75.0 | 68.5 | 25.5 | 13.3 | 6.7 | 52.5 | 76.4 | 35.7 | 37.6 | 56.3 | 37.0 | 46.7 |
Qwen2.5-7B-Coder | - | - | 80.5 | 69.3 | 19.9 | 6.7 | 3.3 | 40.0 | 54.0 | 17.3 | 21.9 | 56.6 | 23.9 | 40.2 |
Qwen2.5-7B-Math | - | - | 61.0 | 57.9 | 16.2 | 10.0 | 16.7 | 42.5 | 64.2 | 15.4 | 28.0 | 45.0 | 29.5 | 37.3 |
基于精选编码数据训练的零风格推理模型 |
||||||||||||||
AceCoder-RM | Ins | 22k | 79.9 | 71.4 | 23.6 | 20.0 | 6.7 | 50.0 | 76.4 | 34.6 | 36.7 | 58.3 | 37.4 | 47.9 |
AceCoder-Rule | Ins | 22k | 77.4 | 69.0 | 19.9 | 13.3 | 6.7 | 50.0 | 76.0 | 37.5 | 37.8 | 55.4 | 36.9 | 46.2 |
AceCoder-RM | Coder | 22k | 78.0 | 66.4 | 27.5 | 13.3 | 3.3 | 27.5 | 62.6 | 29.4 | 29.0 | 57.3 | 27.5 | 42.4 |
AceCoder-Rule | Coder | 22k | 80.5 | 70.4 | 29.0 | 6.7 | 6.7 | 40.0 | 62.8 | 27.6 | 27.4 | 60.0 | 28.5 | 44.3 |
CodeR1-LC2k | Ins | 2k | 81.7 | 71.7 | 28.1 | 13.3 | 10.0 | 45.0 | 75.0 | 33.5 | 36.7 | 60.5 | 35.6 | 48.0 |
CodeR1-12k | Ins | 12k | 81.1 | 73.5 | 29.3 | 13.3 | 3.3 | 37.5 | 74.0 | 35.7 | 36.9 | 61.3 | 33.5 | 47.4 |
基于精选数学数据训练的零风格推理者 |
||||||||||||||
PRIME-Zero | Coder | 484k | 49.4 | 51.1 | 11.0 | 23.3 | 23.3 | 67.5 | 81.2 | 37.9 | 41.8 | 37.2 | 45.8 | 41.5 |
SimpleRL-Zoo | Base | 8.5k | 73.2 | 63.2 | 25.6 | 16.7 | 3.3 | 57.5 | 77.0 | 35.7 | 41.0 | 54.0 | 38.5 | 46.3 |
Oat-Zero | Math | 8.5k | 62.2 | 59.0 | 15.2 | 30.0 | 16.7 | 62.5 | 80.0 | 34.9 | 41.6 | 45.5 | 44.3 | 44.9 |
ORZ | Base | 57k | 80.5 | 64.3 | 22.0 | 13.3 | 16.7 | 60.0 | 81.8 | 32.7 | 45.0 | 55.6 | 41.6 | 48.6 |
无需精选数据的绝对零训练(我们的方法) |
||||||||||||||
AZR (Ours) | Base | 0 | 71.3 -1.9 | 69.1 +3.8 | 25.3 +7.8 | 13.3 +6.6 | 13.3 +10.0 | 52.5 +15.0 | 74.4 +9.6 | 38.2 +13.2 | 38.5 +10.8 | 55.2 +3.2 | 38.4 +10.9 | 46.8 +7.0 |
AZR (Ours) | Coder | 0 | 83.5 +3.0 | 69.6 +0.3 | 31.7 +11.8 | 20.0 +13.3 | 10.0 +6.7 | 57.5 +17.5 | 72.6 +22.6 | 36.4 +19.1 | 38.2 +16.3 | 61.6 +5.0 | 39.1 +15.2 | 50.4 +10.2 |
基于 Qwen2.5-7B 模型的 RL 训练推理器在三个标准代码基准测试(HumanEval + , MBPP + , LCB v5)和六个数学基准测试(AIME'24, AIME'25, AMC'23, MATH500, Minerva, OlympiadBench)上的表现。三个代码基准测试和六个数学基准测试的平均表现计算为两个平均值的平均值:AVG = (CAvg + MAvg) / 2. 我们用 + 表示相对于基础模型的绝对百分比增幅。 所有模型都是使用不同版本的 Qwen2.5-7B 模型训练的,并标注了模型和数据的使用情况。
3.2. 扩展结果
模型系列 | Variant | Code Avg | Math Avg | Total Avg |
---|---|---|---|---|
Qwen2.5-3B Coder | | 51.2 | 18.8 | 35.0 |
Qwen2.5-3B Coder | + AZR(我们的) | 54.9 +3.7 | 26.5 +7.7 | 40.7 +5.7 |
Qwen2.5-7B Coder | | 56.6 | 23.9 | 40.2 |
Qwen2.5-7B Coder | + AZR(我们的) | 61.6 +5.0 | 39.1 +15.2 | 50.4 +10.2 |
Qwen2.5-14B Coder | | 60.0 | 20.2 | 40.1 |
Qwen2.5-14B Coder | + AZR(我们的) | 63.6 +3.6 | 43.0 +22.8 | 53.3 +13.2 |
不同模型大小下的分布外推理性能,以代码任务、数学任务及整体平均值的平均值来衡量。我们研究了模型规模从 3B 参数扩展到 14B 参数的影响。
鉴于 Coder 模型在 7B 规模上表现出色,我们进一步评估了更小和更大的模型(Qwen2.5-3B-Coder 和 Qwen2.5-14B-Coder)以扩展分析。由于这些零风格推理模型没有现成的基线,我们将每个模型与其对应的基础 Coder 模型进行性能比较。
结果显示了一个明显的趋势:我们的方法在大模型上取得了更大的提升。在分布内设置中,7B 和 14B 模型在 200 训练步骤后继续提升,而较小的 3B 模型则趋于平稳。在分布外域中,较大的模型也比较小的模型表现出更大的整体性能提升:3B、7B 和 14B 模型的整体性能提升分别为+5.7、+10.2 和+13.2。 这是一个令人鼓舞的迹象,表明扩展提高了 AZR 的有效性。在未来的研究中,我们计划研究在绝对零范式中性能的扩展规律。
3.3. 其他重要发现
代码能力先验会增强推理。基础 Qwen-Coder-7b 模型的数学平均成绩比 Qwen-7b 低 3.6 分。但在经过 AZR 训练后,Coder 变体比 Base 变体高出 0.7 分,表明强大的编码能力可能放大了 AZR 训练带来的整体推理能力提升。
在跨领域迁移能力方面,AZR 表现更为突出。在 RLVR 之后,专家代码模型的数学准确度平均仅提高了 0.65 个百分点,而 AZR-Base-7B 和 AZR-Coder-7B 在经过自行提出的代码推理任务训练后,数学平均成绩分别提高了 10.9 和 15.2 个百分点,这表明它们在通用推理能力方面有了显著提升。
当中间计划出现时,注释自然产生。在解决代码归纳任务时,AZR 通常会将逐步计划融入注释和代码中(见图 4),类似于 ReAct 提示框架。类似的行为也出现在更大的形式数学模型中,如 DeepSeek Prover v2(671B)。因此,我们相信,允许模型在生成长回答时使用中间暂存区,可能对其他领域也有益。
认知行为和 token 长度取决于推理模式。通过 AZR 训练,逐步推理、枚举和试错等认知行为显现出来。不同类型的任务表现出不同的行为,其中试错行为在溯因任务中尤为明显,如图 5 所示。此外,在 AZR 训练过程中,token 数量会增加,但增幅因任务类型而异:溯因任务增长最多,因为模型会进行试错直到输出匹配;而演绎和归纳任务的增长则较小。
安全警报响起。我们观察到 AZR 以 Llama3.1-8b 为基础模型时,偶尔会出现令人担忧的思维链,我们称之为 "uh-oh 时刻",示例见图 6,这突显了未来在安全感知训练方面工作的必要性。

注释作为中间计划的示例。模型在解决复杂推理任务时,会自然形成使用注释作为中间步骤的习惯,类似于 ReAct 提示框架。这种新出现的行为展示了模型如何通过自我注释来分解问题为可管理的步骤。

模型提出的任务及其响应示例,用于解决溯因(abduction)任务。(左)模型自主提出溯因任务的输入和程序。我们执行该程序以验证其有效性并获取相应输出。(右)模型在解决溯因任务时的推理过程:在给定程序和输出后,尝试推断原始输入。模型首先分析程序,提出初始输入,并通过代码推理得出输出。如果存在不匹配,模型会检查并迭代调整输入,直到生成的输出与目标输出一致。有趣的是,模型生成的输入虽然与标准输入不同,但因为它能生成正确的输出,所以答案被视为正确。

在 AZR 训练中 "Uh-Oh 时刻" 的示例。使用 Llama3.1-8b 作为基础模型时,我们偶尔会在推理过程中观察到令人担忧的思维过程。这个例子突显了在绝对零范式未来的迭代中,需要进行安全训练。
4. 引用的来源
@misc{zhao2025absolutezeroreinforcedselfplay,
title={Absolute Zero: Reinforced Self-play Reasoning with Zero Data},
author={Andrew Zhao and Yiran Wu and Yang Yue and Tong Wu and Quentin Xu and Yang Yue and Matthieu Lin and Shenzhi Wang and Qingyun Wu and Zilong Zheng and Gao Huang},
year={2025},
eprint={2505.03335},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2505.03335},
}