Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
120 commits
Select commit Hold shift + click to select a range
b44c854
feat(orchestrator): add orchestrator v2 with shared train+eval scheduler
mikasenghaas May 26, 2026
a3917c2
refactor(orchestrator_v2): address review feedback
mikasenghaas May 26, 2026
2609b01
refactor(orchestrator_v2): decompose batcher into single-purpose comp…
mikasenghaas May 26, 2026
aeecbbd
refactor(orchestrator_v2): rollout-as-atom + train/eval sinks with fl…
mikasenghaas May 27, 2026
766a407
refactor(orchestrator_v2): three-level sinks (rollout / group / batch)
mikasenghaas May 27, 2026
d2ef0f5
refactor(orchestrator_v2): unify is_batch_done trigger across train a…
mikasenghaas May 27, 2026
7274ab6
chore(configs): replace orch_v2 debug overlay with hendrycks_math sho…
mikasenghaas May 27, 2026
6bc11e1
refactor(orchestrator_v2/ckpt): drop unused last_eval_step from Progress
mikasenghaas May 27, 2026
d3b70cd
refactor(orchestrator_v2): drop pre-baked metrics from batch result t…
mikasenghaas May 27, 2026
75da0a3
refactor(orchestrator_v2): consolidate dataclasses + types into types.py
mikasenghaas May 27, 2026
685c24a
chore(configs): drop max_completion_tokens cap from orch_v2_demo
mikasenghaas May 27, 2026
15953cb
refactor(orchestrator_v2): move group/batch boundary signals into the…
mikasenghaas May 27, 2026
649ab6a
fix(configs): allow matching shared+sub values in propagator conflict…
mikasenghaas May 27, 2026
36f7105
refactor(orchestrator): replace legacy orchestrator with v2 implement…
mikasenghaas May 27, 2026
e812e0a
refactor(orchestrator): UUID group IDs, pool-owned client selection, …
mikasenghaas May 27, 2026
56f8cc1
revert: restore validation.py to main
mikasenghaas May 27, 2026
3730cd6
refactor(orchestrator): factor TrainSource + EvalSource out of dispat…
mikasenghaas May 27, 2026
7b2eacc
refactor(orchestrator): slim dispatcher, move eval triggers to per-ba…
mikasenghaas May 27, 2026
3c90b78
refactor(configs): move off-policy caps to per-side {train,eval}.max_…
mikasenghaas May 27, 2026
e447373
refactor(eval_sink): build metrics in process_batch + rename expected…
mikasenghaas May 27, 2026
5e76a5b
revert: collapse per-side off-policy caps back to global max_off_poli…
mikasenghaas May 27, 2026
c4f5634
Merge remote-tracking branch 'origin/main' into exp/orchestrator-v2
mikasenghaas May 27, 2026
d702343
Merge remote-tracking branch 'origin/main' into exp/orchestrator-v2
mikasenghaas May 27, 2026
f49dd80
chore(config): remove wandb.shared deprecation shim
mikasenghaas May 27, 2026
50e7ec7
fix(config): honor --no-wandb / --no-ckpt across shared and sub-configs
mikasenghaas May 27, 2026
df222bd
chore(changelog): drop --no-wandb fix entry
mikasenghaas May 27, 2026
4f8407e
chore(comment): trim bare-block enablement comment
mikasenghaas May 27, 2026
6279052
docs(wandb): drop legacy-mode reference; add AGENTS rule
mikasenghaas May 27, 2026
c7729a1
chore(agents): trim docs rule
mikasenghaas May 27, 2026
60766f6
Merge remote-tracking branch 'origin/chore/remove-wandb-shared-deprec…
mikasenghaas May 27, 2026
907a0a6
Merge remote-tracking branch 'origin/main' into exp/orchestrator-v2
mikasenghaas May 27, 2026
25517ce
refactor(eval_sink): drop redundant batch_arrivals counter
mikasenghaas May 27, 2026
329252d
refactor(sinks): symmetrize TrainSink + EvalSink API; non-optional Ev…
mikasenghaas May 27, 2026
0ef56ae
refactor(orchestrator): quiet filter per-group logs; add per-group su…
mikasenghaas May 27, 2026
6ae5f8d
refactor(rollout): key sinks by group_id UUID, not (env, example_id)
mikasenghaas May 27, 2026
205ee25
refactor(types): rename ProcessResult→TrainBatchMetrics, add typed Ev…
mikasenghaas May 27, 2026
f3a42fd
chore(filters): drop per-group detection log from apply_filters
mikasenghaas May 27, 2026
32719d0
refactor(periodic_logger): distribute per-component; no defensive wandb
mikasenghaas May 27, 2026
4717624
refactor(metrics): drop dispatcher gauges/drain from step-aligned Met…
mikasenghaas May 27, 2026
545d2df
chore: revert stray uv.lock edit to match origin/main
mikasenghaas May 27, 2026
e3f2326
refactor(orchestrator): tighten APIs + collapse dead state
mikasenghaas May 27, 2026
aedb452
fix(dispatcher): emit Cancelled markers for un-scheduled rollouts on …
mikasenghaas May 27, 2026
8bc6a4a
refactor(sources): unify next_example API + relax over-strict train p…
mikasenghaas May 27, 2026
1f59407
refactor(orchestrator): log overhaul + drain-cancel + multi-env support
mikasenghaas May 27, 2026
bef17a6
refactor(eval_source): unify trigger + trigger_at_start
mikasenghaas May 27, 2026
f91d21f
refactor(orchestrator): tighten attr typing, drop event_loop_lag, polish
mikasenghaas May 27, 2026
31398ee
refactor(orchestrator): consolidate periodic logs, per-env breakdown,…
mikasenghaas May 27, 2026
6b63884
fix(orchestrator): pipeline log per-env reconciliation + step-0 step …
mikasenghaas May 27, 2026
d3b0fcd
chore(orchestrator): move Error/Truncation to end of success log lines
mikasenghaas May 27, 2026
ddfca0c
chore(trainer): use format_time for per-step success log
mikasenghaas May 27, 2026
b35565c
fix(dispatcher): keep dispatching eval group tails after queue empties
mikasenghaas May 27, 2026
34f5a4f
feat(configs): rlm_swe qwen3-4b-thinking on 2-node slurm
mikasenghaas May 27, 2026
1f86c7e
fix(orchestrator): pipeline-view accounting + drain-switch overlap fi…
mikasenghaas May 27, 2026
8b21317
fix(sinks): pipeline view fills per-rollout for non-group-scoring envs
mikasenghaas May 27, 2026
5b7be19
chore(configs): drop hendrycks_math/sanity seq_len 8192 → 4096
mikasenghaas May 27, 2026
5617925
refactor(eval_sink): rename epoch_progress → batch_progress
mikasenghaas May 27, 2026
0b69dd4
chore(orchestrator): reformat pipeline log + drop avg@k wandb metric
mikasenghaas May 28, 2026
ed06dda
chore(configs): bump default LogConfig.interval 5s → 10s
mikasenghaas May 28, 2026
f1aa527
chore(orchestrator): reword train + eval success log prefixes
mikasenghaas May 28, 2026
fb78cee
chore(periodic_logger): drop name-column prefix from console emit
mikasenghaas May 28, 2026
4b6251e
chore(orchestrator): pipeline log puts batch progress before inflight
mikasenghaas May 28, 2026
601423f
chore(orchestrator): show batch-progress percentages in pipeline log
mikasenghaas May 28, 2026
2c86b48
chore(configs): hendrycks_math/sanity batch 512 → 256, keep max_infli…
mikasenghaas May 28, 2026
dcaa3a1
chore(orchestrator): drop /max from train-inflight pipeline log
mikasenghaas May 28, 2026
cd34551
chore(orchestrator): drop env list from eval-trigger mode-flip reason
mikasenghaas May 28, 2026
a601dd9
chore(configs): drop verbose comment on EvalConfig.validate_non_empty…
mikasenghaas May 28, 2026
7687dcf
chore(configs): drop verbose docstring on OrchestratorExperimentalConfig
mikasenghaas May 28, 2026
a16f35c
chore(configs): drop orch-v2 references from pre_batch_filters docstring
mikasenghaas May 28, 2026
2fa639b
chore(configs): trim verbose comment on train group_size propagation
mikasenghaas May 28, 2026
be50608
chore(configs): generic LogConfig.interval docstring
mikasenghaas May 28, 2026
c43363a
chore(eval_source): drop TrainSource cross-reference from docstring
mikasenghaas May 28, 2026
3af5c66
chore(filters): revert setup_filters docstring to one-liner
mikasenghaas May 28, 2026
1220034
refactor(orchestrator): typed rollout dataclasses, raw stays pristine
mikasenghaas May 28, 2026
bfc94ab
fix(trajectories): thread env_name as kwarg into interleave_rollout
mikasenghaas May 28, 2026
10ed23d
chore(types): drop ``_``-prefix from rollout metadata keys in to_dict
mikasenghaas May 28, 2026
31e71f9
feat(types): add FinishedRollout.rollout_id; fix advantage grouping
mikasenghaas May 28, 2026
e6316de
chore(orchestrator): reword eval success log 'Finished evaluating' ->…
mikasenghaas May 28, 2026
ffb913c
refactor(advantage): drop grouping logic, cache advantage_fn on the sink
mikasenghaas May 28, 2026
286601d
chore(orchestrator): drop Valid X/Y from eval success log (redundant …
mikasenghaas May 28, 2026
ae9d267
chore(orchestrator): bump per-env success-log indent by one space
mikasenghaas May 28, 2026
a158164
fix(orchestrator): drop_group orphan, empty-batch spin, resume eval dup
mikasenghaas May 28, 2026
4b04c91
chore(orchestrator): drop 'Starting orchestrator step N' info log
mikasenghaas May 28, 2026
e92345f
refactor(dispatcher): rename + slim down wandb metrics
mikasenghaas May 28, 2026
b7f0dee
chore(orchestrator): drop verbose attribute-schema comment block
mikasenghaas May 28, 2026
ffa6905
chore: trim excessive code comments + docstrings
mikasenghaas May 28, 2026
15bc7f1
refactor(types): trim types.py 255 → 212 lines
mikasenghaas May 28, 2026
62d1ba7
test: fix unit tests for typed rollouts + dropped modules
mikasenghaas May 28, 2026
80837b6
refactor(configs): drop obsolete buffer + cancel-inflight fields
mikasenghaas May 28, 2026
7d45c2d
refactor: read seq/completion lens from vf.RolloutOutput.token_usage
mikasenghaas May 28, 2026
893084d
fix(orchestrator): revert eval reward key to avg@N; migrate example b…
mikasenghaas May 28, 2026
c97c86c
Merge remote-tracking branch 'origin/main' into exp/orchestrator-v2
mikasenghaas May 28, 2026
018861f
feat(configs): warn + migration guide for removed [orchestrator.buffer]
mikasenghaas May 28, 2026
be0a498
chore(configs): restore unchanged docstrings/comments to minimize diff
mikasenghaas May 28, 2026
bc235d2
feat(rlm-swe): single-node qwen3.5-4b config; pin rlm-swe as workspac…
mikasenghaas May 28, 2026
56e8926
feat(rlm-swe): qwen3.5-4b config contents + workspace install for rlm…
mikasenghaas May 28, 2026
a0e1a2e
fix: eval metric correctness + integration-test log parsing
mikasenghaas May 28, 2026
aeac9d5
refactor(orchestrator): pipeline-log clarity, per-rollout offload, li…
mikasenghaas May 28, 2026
6684fc6
feat(rlm-swe): multinode 1 train + 1 infer node, inference dp=8
mikasenghaas May 28, 2026
30e6a9a
feat(rlm-swe): add 1h per-rollout timeout on swebench eval
mikasenghaas May 28, 2026
288ddfc
fix(rlm-swe): cp=4 to fix trainer OOM on step 1
mikasenghaas May 29, 2026
303cd9d
chore: update uv.lock to match pyproject (fix `uv sync --locked` in CI)
mikasenghaas May 29, 2026
9214d62
refactor(trainer): match orchestrator log format (drop colons after l…
mikasenghaas May 29, 2026
866415d
refactor(trainer): drop "Time" label from step line (match orch)
mikasenghaas May 29, 2026
c03b27f
feat(rlm-swe): 2 inference nodes (dp=16) to double decode throughput
mikasenghaas May 29, 2026
6aa4d92
fix(rlm-swe): inference dp is per-node — set dp=8 (not 16)
mikasenghaas May 29, 2026
516abcc
feat(rlm-swe): 2 independent inference replicas instead of cross-node DP
mikasenghaas May 29, 2026
d6e4f34
fix(orchestrator): warn (not success) when cleanup is forced
mikasenghaas May 29, 2026
d07b387
Merge remote-tracking branch 'origin/main' into exp/orchestrator-v2
mikasenghaas May 29, 2026
ff4f74f
chore(orchestrator): remove dead print_benchmark from utils
mikasenghaas May 29, 2026
11be6fc
fix(orchestrator): eval the student (not teacher) in SFT; ckpt drain …
mikasenghaas May 29, 2026
7fa8384
fix(orchestrator): per-env group_size in solve rates; group by group_id
mikasenghaas May 29, 2026
826a3b4
chore(configs): consolidate multimodal configs under configs/debug
mikasenghaas May 29, 2026
7619727
fix(orchestrator): eval through the eval (chat) client, not the renderer
mikasenghaas May 29, 2026
0f5da88
test(reverse_text): lower min reward threshold 0.65 -> 0.6
mikasenghaas May 29, 2026
59817d6
fix(orchestrator): restore checkpoint resume compat with pre-rewrite …
mikasenghaas May 29, 2026
fe123b2
fix(orchestrator): gate dispatcher on lead instead of blocking ship
mikasenghaas May 30, 2026
a562280
fix(dispatcher): claim drop_group tasks atomically before emitting
mikasenghaas May 30, 2026
8ce8d9b
feat(rlm-swe): enable prefix caching + language-model-only inference
mikasenghaas May 30, 2026
6706bb7
feat(rlm-swe): disable thinking in the qwen3.5 renderer
mikasenghaas May 30, 2026
9963b88
feat(rlm-swe): bump main run to 400 steps (non-thinking)
mikasenghaas May 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

Documenting **breaking** configuration changes — renamed, removed, or moved fields that require users to update existing configs.

- **Orchestrator async-pipeline rewrite** (collection of removals/renames). The orchestrator was rewritten to overlap train/eval rollouts on a shared concurrency limiter; several config fields were removed or renamed.
- **`orchestrator.seed` removed**: was only consumed by the deleted buffer; no replacement.
- **`orchestrator.eval.eval_base_model` → `orchestrator.eval.skip_first_step`** (semantics inverted): `eval_base_model = true` becomes `skip_first_step = false` (the default — run the step-0 eval before any train rollouts), and `eval_base_model = false` becomes `skip_first_step = true`. No alias; configs setting `eval_base_model` must rename the field and flip its value.
- **`orchestrator.eval.skip_eval_on_resume` (and its alias `skip_eval_on_restart`) removed**: folded into `skip_first_step`. Resume no longer re-fires an already-completed eval (deduped via per-env last-eval-step tracking in the checkpointed progress).
- **`orchestrator.eval.cancel_inflight_rollouts_on_eval` removed**: the drain-switch overlap (stop scheduling new train rollouts, let in-flight train rollouts drain while eval rollouts queue) is now the only eval transition mode.
- **`ckpt.skip_buffer` removed**: there is no buffer to skip.
- **`[orchestrator.buffer]` removed** (difficulty pools): the whole block and every key (`seed`, `easy_threshold`, `hard_threshold`, `easy_fraction`, `hard_fraction`, `online_difficulty_filtering`, `hash_keys`) are gone. A before-validator drops the block and emits a `FutureWarning` (so old configs still parse). To preserve `online_difficulty_filtering = true`, enforce the zero-advantage pre-batch filter: `[[orchestrator.pre_batch_filters]]\ntype = "zero_advantage"\nenforce = true`.
- **`orchestrator.filters` → `orchestrator.post_batch_filters`** (backward compatible): a before-validator aliases `filters`, so existing TOML/CLI keep parsing. New configs should use `post_batch_filters` (and the new `pre_batch_filters`). Filters are **train-only** now — eval rollouts are no longer filtered.
- **`orchestrator.max_off_policy_steps` now also applies to eval** (behavior change, field unchanged): eval rollouts that fall more than `max_off_policy_steps` versions behind the policy are cancelled, same as train. (2026-05-29)
- **`sampling.min_tokens`, `sampling.repetition_penalty`, `sampling.seed` removed**: Dropped from both `TrainSamplingConfig` and `EvalSamplingConfig` (group-level `[orchestrator.train.sampling]` / `[orchestrator.eval.sampling]` and per-env `[[orchestrator.train.env.sampling]]` / `[[orchestrator.eval.env.sampling]]`). `min_tokens` suppressed natural EOS, `repetition_penalty` distorts the on-policy sampling distribution, and `seed` wasn't pulling its weight — none belonged on the supported config surface. Existing configs setting any of these must delete the field. Hard-deprecation, no migration window. (2026-05-27)
- **`wandb.shared` removed**: The deprecation shim that popped `wandb.shared` from input dicts with a `FutureWarning` (introduced in #2649) is gone. The `rl` entrypoint always uses shared W&B mode now, and existing configs that still set `wandb.shared = true` (or `false`) will fail validation. Drop the field from your config. (2026-05-27)
- **`max_async_level` and `strict_async_level` removed**: The async-execution semantics between trainer and orchestrator are now design invariants, not config knobs. The trainer always runs exactly one step ahead of inference, and the orchestrator always adopts the freshest checkpoint that doesn't violate the one-step barrier. The shared top-level `max_async_level`, the per-sub-config `trainer.max_async_level` / `orchestrator.max_async_level`, and `orchestrator.strict_async_level` have all been removed. Existing configs setting any of these must drop the field; the previous defaults (`max_async_level = 1`, `strict_async_level = false`) match the new hardcoded behavior. Bench mode no longer bypasses the weight-ckpt wait (the `int(1e9)` workaround is gone) and `multimodal/rl_color_codeword_feat_renderer.toml`'s prior `max_async_level = 0` (fully synchronous on-policy) is no longer expressible. (2026-05-25)
Expand Down
2 changes: 1 addition & 1 deletion configs/ci/integration/alphabet_sort.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
max_steps = 5
max_steps = 10
seq_len = 2048

[ckpt]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
max_steps = 3000
seq_len = 8192
seq_len = 4096

[wandb]
project = "hendrycks-math-debug"
Expand All @@ -9,7 +9,8 @@ name = "hendrycks-math-sanity"
name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

[orchestrator]
batch_size = 512
batch_size = 256
max_inflight_rollouts = 512
group_size = 8

[[orchestrator.train.env]]
Expand All @@ -30,7 +31,7 @@ group_size = 16
[trainer.model.compile]

[inference.model]
max_model_len = 8192
max_model_len = 4096

[log]
level = "debug"
Expand Down
52 changes: 52 additions & 0 deletions configs/debug/multi_env/reverse_text.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Debug RL config exercising multiple train envs and multiple eval envs.
# All four env entries use the same env id ("reverse-text") and differ only
# in their `name`.

max_steps = 20
seq_len = 2048

[model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"

[wandb]
project = "reverse-text-debug"
name = "debug-multi-env"

[orchestrator]
training_mode = "rl"
batch_size = 128
group_size = 16

[orchestrator.renderer]
name = "qwen3"

# -- multi train envs --

[[orchestrator.train.env]]
id = "reverse-text"
name = "reverse-text-train-1"

[[orchestrator.train.env]]
id = "reverse-text"
name = "reverse-text-train-2"

# -- multi eval envs --

# Group-level eval settings shared by the eval envs below.
[orchestrator.eval]
interval = 10
num_examples = 16
group_size = 4

[[orchestrator.eval.env]]
id = "reverse-text"
name = "reverse-text-eval-1"

[[orchestrator.eval.env]]
id = "reverse-text"
name = "reverse-text-eval-2"
# Differs from the [orchestrator.eval] interval (10); presumably a per-env
# override so the two eval envs fire on different schedules. TODO confirm
# against the config schema.
interval = 5

[trainer.optim]
lr = 3e-6

[inference]
gpu_memory_utilization = 0.5

[inference.model]
# NOTE(review): far smaller than the top-level seq_len (2048); presumably
# intentional for short reverse-text samples. Confirm.
max_model_len = 128
75 changes: 75 additions & 0 deletions configs/debug/multimodal.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# 2-GPU debug RL run for the multimodal (renderer) path: Qwen3-VL-4B on
# color-codeword. Sized to actually learn (reward should trend up) while
# staying 2-GPU friendly. Exercises RendererClient + Qwen3VLRenderer end-to-end.

max_steps = 15
seq_len = 4096

[model]
name = "Qwen/Qwen3-VL-4B-Instruct"

[model.vlm]
vision_encoder_attr = "model.visual"
language_model_attr = "model.language_model"

[deployment]
num_train_gpus = 1
num_infer_gpus = 1
gpus_per_node = 2

[orchestrator]
batch_size = 256
group_size = 16
# Image processor is CPU-bound and dominates for VLMs; returns diminish past 4.
pool_size = 4

# Step 0 on Qwen3-VL-4B vs color-codeword can be uniform (all-correct or
# all-wrong), so don't enforce zero-advantage dropping or training would crash
# before any progress.
#
# `post_batch_filters` is the canonical key; `filters` is only a legacy alias
# kept so older configs keep parsing.
[[orchestrator.post_batch_filters]]
type = "gibberish"

[[orchestrator.post_batch_filters]]
type = "repetition"

[[orchestrator.post_batch_filters]]
type = "zero_advantage"
enforce = false

[orchestrator.train.sampling]
max_completion_tokens = 64

[[orchestrator.train.env]]
id = "color-codeword"
args = { images_per_turn = 1, max_turns = 3, num_examples = 1000, seed = 42 }

# Default renderer (AutoRendererConfig) resolves Qwen3-VL-4B-Instruct from
# MODEL_RENDERER_MAP to Qwen3VLRenderer at runtime; no explicit name needed.

[trainer]

[trainer.model]
optimization_dtype = "bfloat16"
reduce_dtype = "bfloat16"

[trainer.optim]
lr = 3e-6

[inference]

[inference.model]
# Workaround for vLLM 0.20.1 Qwen3-VL deepstack buffer bug: when num_scheduled_tokens
# (188) gets padded up to the next cudagraph_capture_size (192), the model's
# _set_deepstack_input_embeds sizes the buffer to 188 but forward() runs with 192,
# triggering "Requested more deepstack tokens than available in buffer". Eager mode
# skips the padding so num_input_tokens == num_scheduled_tokens.
enforce_eager = true

[inference.parallel]
dp = 1
tp = 1

[wandb]
project = "debug"
name = "multimodal"
tags = ["qwen3vl-4b", "color-codeword", "renderer"]
4 changes: 0 additions & 4 deletions configs/hendrycks_math/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ id = "math-env"
name = "hendrycks-math"
args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "default", math_verify_max_workers = 128, math_verify_timeout = 60 }

[orchestrator.buffer]
easy_threshold = 1.0
hard_threshold = 0.0

[orchestrator.eval]
interval = 10

Expand Down
4 changes: 0 additions & 4 deletions configs/math_group/rl.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@ name = "acereason-math"
args = { dataset_name = "nvidia/AceReason-Math", dataset_subset = "default", question_key = "problem" }
ratio = 0.5

[orchestrator.buffer]
easy_threshold = 1.0
hard_threshold = 0.0

[orchestrator.eval]
interval = 50

Expand Down
59 changes: 0 additions & 59 deletions configs/multi_reverse_text/rl.toml

This file was deleted.

34 changes: 0 additions & 34 deletions configs/multimodal/rl_color_codeword.toml

This file was deleted.

94 changes: 0 additions & 94 deletions configs/multimodal/rl_color_codeword_feat_renderer.toml

This file was deleted.

Loading
Loading