PrimeIntellect-ai · mikasenghaas · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 27, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,15 @@
 
 Documenting **breaking** configuration changes — renamed, removed, or moved fields that require users to update existing configs.
 
+- **Orchestrator async-pipeline rewrite** (collection of removals/renames). The orchestrator was rewritten to overlap train/eval rollouts on a shared concurrency limiter; several config fields were removed or renamed.
+  - **`orchestrator.seed` removed**: was only consumed by the deleted buffer; no replacement.
+  - **`orchestrator.eval.eval_base_model` → `orchestrator.eval.skip_first_step`** (semantics inverted): `eval_base_model = true` becomes `skip_first_step = false` (the default — run the step-0 eval before any train rollouts), and `eval_base_model = false` becomes `skip_first_step = true`. No alias; configs setting `eval_base_model` must rename the field and invert its value.
+  - **`orchestrator.eval.skip_eval_on_resume` (and its alias `skip_eval_on_restart`) removed**: folded into `skip_first_step`. Resume no longer re-fires an already-completed eval (deduped via per-env last-eval-step tracking in the checkpointed progress).
+  - **`orchestrator.eval.cancel_inflight_rollouts_on_eval` removed**: the drain-switch overlap (stop scheduling new train rollouts, let in-flight train rollouts drain while eval rollouts queue) is now the only eval transition mode.
+  - **`ckpt.skip_buffer` removed**: there is no buffer to skip.
+  - **`[orchestrator.buffer]` removed** (difficulty pools): the whole block and every key (`seed`, `easy_threshold`, `hard_threshold`, `easy_fraction`, `hard_fraction`, `online_difficulty_filtering`, `hash_keys`) are gone. A before-validator drops the block and emits a `FutureWarning` (so old configs still parse). To preserve the behavior of `online_difficulty_filtering = true`, enforce the zero-advantage pre-batch filter by adding a `[[orchestrator.pre_batch_filters]]` entry with `type = "zero_advantage"` and `enforce = true`.
+  - **`orchestrator.filters` → `orchestrator.post_batch_filters`** (backward compatible): a before-validator aliases `filters`, so existing TOML/CLI keep parsing. New configs should use `post_batch_filters` (and the new `pre_batch_filters`). Filters are **train-only** now — eval rollouts are no longer filtered.
+  - **`orchestrator.max_off_policy_steps` now also applies to eval** (behavior change, field unchanged): eval rollouts that fall more than `max_off_policy_steps` versions behind the policy are cancelled, same as train. (2026-05-29)
 - **`sampling.min_tokens`, `sampling.repetition_penalty`, `sampling.seed` removed**: Dropped from both `TrainSamplingConfig` and `EvalSamplingConfig` (group-level `[orchestrator.train.sampling]` / `[orchestrator.eval.sampling]` and per-env `[[orchestrator.train.env.sampling]]` / `[[orchestrator.eval.env.sampling]]`). `min_tokens` suppressed natural EOS, `repetition_penalty` distorts the on-policy sampling distribution, and `seed` wasn't pulling its weight — none belonged on the supported config surface. Existing configs setting any of these must delete the field. Hard-deprecation, no migration window. (2026-05-27)
 - **`wandb.shared` removed**: The deprecation shim that popped `wandb.shared` from input dicts with a `FutureWarning` (introduced in #2649) is gone. The `rl` entrypoint always uses shared W&B mode now, and existing configs that still set `wandb.shared = true` (or `false`) will fail validation. Drop the field from your config. (2026-05-27)
 - **`max_async_level` and `strict_async_level` removed**: The async-execution semantics between trainer and orchestrator are now design invariants, not config knobs. The trainer always runs exactly one step ahead of inference, and the orchestrator always adopts the freshest checkpoint that doesn't violate the one-step barrier. The shared top-level `max_async_level`, the per-sub-config `trainer.max_async_level` / `orchestrator.max_async_level`, and `orchestrator.strict_async_level` have all been removed. Existing configs setting any of these must drop the field; the previous defaults (`max_async_level = 1`, `strict_async_level = false`) match the new hardcoded behavior. Bench mode no longer bypasses the weight-ckpt wait (the `int(1e9)` workaround is gone) and `multimodal/rl_color_codeword_feat_renderer.toml`'s prior `max_async_level = 0` (fully synchronous on-policy) is no longer expressible. (2026-05-25)

diff --git a/configs/ci/integration/alphabet_sort.toml b/configs/ci/integration/alphabet_sort.toml
@@ -1,4 +1,4 @@
-max_steps = 5
+max_steps = 10
 seq_len = 2048
 
 [ckpt]

diff --git a/configs/hendrycks_math/sanity.toml → configs/debug/hendrycks_sanity/rl.toml b/configs/hendrycks_math/sanity.toml → configs/debug/hendrycks_sanity/rl.toml
@@ -1,5 +1,5 @@
 max_steps = 3000
-seq_len = 8192
+seq_len = 4096
 
 [wandb]
 project = "hendrycks-math-debug"
@@ -9,7 +9,8 @@ name = "hendrycks-math-sanity"
 name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 
 [orchestrator]
-batch_size = 512
+batch_size = 256
+max_inflight_rollouts = 512
 group_size = 8
 
 [[orchestrator.train.env]]
@@ -30,7 +31,7 @@ group_size = 16
 [trainer.model.compile]
 
 [inference.model]
-max_model_len = 8192
+max_model_len = 4096
 
 [log]
 level = "debug"

diff --git a/configs/debug/multi_env/reverse_text.toml b/configs/debug/multi_env/reverse_text.toml
@@ -0,0 +1,52 @@
+max_steps = 20
+seq_len = 2048
+
+[model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
+
+[wandb]
+project = "reverse-text-debug"
+name = "debug-multi-env"
+
+[orchestrator]
+training_mode = "rl"
+batch_size = 128
+group_size = 16
+
+[orchestrator.renderer]
+name = "qwen3"
+
+# -- multi train envs --
+
+[[orchestrator.train.env]]
+id = "reverse-text"
+name = "reverse-text-train-1"
+
+[[orchestrator.train.env]]
+id = "reverse-text"
+name = "reverse-text-train-2"
+
+# -- multi eval envs --
+
+[orchestrator.eval]
+interval = 10
+num_examples = 16
+group_size = 4
+
+[[orchestrator.eval.env]]
+id = "reverse-text"
+name = "reverse-text-eval-1"
+
+[[orchestrator.eval.env]]
+id = "reverse-text"
+name = "reverse-text-eval-2"
+interval = 5
+
+[trainer.optim]
+lr = 3e-6
+
+[inference]
+gpu_memory_utilization = 0.5
+
+[inference.model]
+max_model_len = 128
diff --git a/configs/debug/multimodal.toml b/configs/debug/multimodal.toml
@@ -0,0 +1,75 @@
+# 2-GPU debug RL run for the multimodal (renderer) path: Qwen3-VL-4B on
+# color-codeword. Sized to actually learn (reward should trend up) while
+# staying 2-GPU friendly. Exercises RendererClient + Qwen3VLRenderer end-to-end.
+
+max_steps = 15
+seq_len = 4096
+
+[model]
+name = "Qwen/Qwen3-VL-4B-Instruct"
+
+[model.vlm]
+vision_encoder_attr = "model.visual"
+language_model_attr = "model.language_model"
+
+[deployment]
+num_train_gpus = 1
+num_infer_gpus = 1
+gpus_per_node = 2
+
+[orchestrator]
+batch_size = 256
+group_size = 16
+# Image processor is CPU-bound and dominates for VLMs; returns diminish past 4.
+pool_size = 4
+
+# Step 0 on Qwen3-VL-4B vs color-codeword can be uniform (all-correct or
+# all-wrong), so don't enforce zero-advantage dropping or training would crash
+# before any progress.
+[[orchestrator.post_batch_filters]]
+type = "gibberish"
+
+[[orchestrator.post_batch_filters]]
+type = "repetition"
+
+[[orchestrator.post_batch_filters]]
+type = "zero_advantage"
+enforce = false
+
+[orchestrator.train.sampling]
+max_completion_tokens = 64
+
+[[orchestrator.train.env]]
+id = "color-codeword"
+args = { images_per_turn = 1, max_turns = 3, num_examples = 1000, seed = 42 }
+
+# Default renderer (AutoRendererConfig) resolves Qwen3-VL-4B-Instruct from
+# MODEL_RENDERER_MAP to Qwen3VLRenderer at runtime; no explicit name needed.
+
+[trainer]
+
+[trainer.model]
+optimization_dtype = "bfloat16"
+reduce_dtype = "bfloat16"
+
+[trainer.optim]
+lr = 3e-6
+
+[inference]
+
+[inference.model]
+# Workaround for vLLM 0.20.1 Qwen3-VL deepstack buffer bug: when num_scheduled_tokens
+# (188) gets padded up to the next cudagraph_capture_size (192), the model's
+# _set_deepstack_input_embeds sizes the buffer to 188 but forward() runs with 192,
+# triggering "Requested more deepstack tokens than available in buffer". Eager mode
+# skips the padding so num_input_tokens == num_scheduled_tokens.
+enforce_eager = true
+
+[inference.parallel]
+dp = 1
+tp = 1
+
+[wandb]
+project = "debug"
+name = "multimodal"
+tags = ["qwen3vl-4b", "color-codeword", "renderer"]
diff --git a/configs/hendrycks_math/rl.toml b/configs/hendrycks_math/rl.toml
@@ -20,10 +20,6 @@ id = "math-env"
 name = "hendrycks-math"
 args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "default", math_verify_max_workers = 128, math_verify_timeout = 60 }
 
-[orchestrator.buffer]
-easy_threshold = 1.0
-hard_threshold = 0.0
-
 [orchestrator.eval]
 interval = 10
 

diff --git a/configs/math_group/rl.toml b/configs/math_group/rl.toml
@@ -23,10 +23,6 @@ name = "acereason-math"
 args = { dataset_name = "nvidia/AceReason-Math", dataset_subset = "default", question_key = "problem" }
 ratio = 0.5
 
-[orchestrator.buffer]
-easy_threshold = 1.0
-hard_threshold = 0.0
-
 [orchestrator.eval]
 interval = 50
 

diff --git a/configs/multi_reverse_text/rl.toml b/configs/multi_reverse_text/rl.toml
diff --git a/configs/multimodal/rl_color_codeword.toml b/configs/multimodal/rl_color_codeword.toml
diff --git a/configs/multimodal/rl_color_codeword_feat_renderer.toml b/configs/multimodal/rl_color_codeword_feat_renderer.toml