diff --git a/renderers/base.py b/renderers/base.py
index 65edf68..e9805c4 100644
--- a/renderers/base.py
+++ b/renderers/base.py
@@ -1098,8 +1098,7 @@ def _patched_load(model_name_or_path: str, **kwargs):
fastokens.patch_transformers()
if not _FASTOKENS_ANNOUNCED:
logger.info(
- "fastokens enabled — tokenizers load through the Rust BPE "
- "fast path (~10x encode speedup)."
+ "fastokens enabled — tokenizers load through the Rust BPE fast path (~10x encode speedup)."
)
_FASTOKENS_ANNOUNCED = True
try:
@@ -1169,8 +1168,8 @@ def load_tokenizer(
def _populate_registry():
if RENDERER_REGISTRY:
return
- from renderers.default import DefaultRenderer
from renderers.deepseek_v3 import DeepSeekV3Renderer
+ from renderers.default import DefaultRenderer
from renderers.glm5 import GLM5Renderer, GLM51Renderer
from renderers.glm45 import GLM45Renderer
from renderers.gpt_oss import GptOssRenderer
@@ -1271,8 +1270,7 @@ def create_renderer(
cls = RENDERER_REGISTRY.get(config.name)
if cls is None:
raise ValueError(
- f"Unknown renderer {config.name!r}. "
- f"Available: {', '.join(sorted(RENDERER_REGISTRY))}"
+ f"Unknown renderer {config.name!r}. Available: {', '.join(sorted(RENDERER_REGISTRY))}"
)
return cls(tokenizer, config)
@@ -1345,7 +1343,7 @@ def build_training_sample(
renderer: Renderer,
messages: list[Message],
*,
- role_to_mask: Callable[[Message], bool],
+ role_to_mask: Callable[[Message], bool] | None = None,
tools: list[ToolSpec] | None = None,
content_sft_roles: "set[str] | frozenset[str] | None" = None,
) -> tuple[list[int], list[bool]]:
@@ -1354,15 +1352,31 @@ def build_training_sample(
Single render() call + message_indices → per-token mask.
Replaces build_incremental_token_mask (O(N) renders → O(1)).
- When the renderer populates ``rendered.sampled_mask``, the loss mask
- is the AND of role-based attribution and the sampled signal: only
- tokens the model would have produced at inference are trainable.
- This keeps SFT byte-aligned with the RL trajectory mask (where the
- prompt / completion split achieves the same effect structurally).
+ When ``role_to_mask`` is omitted, ``loss_mask`` is the renderer's
+ ``sampled_mask`` directly: every token the model would have
+ produced at inference is trainable, regardless of which message
+ it's attributed to. This is the recommended default for renderer
+ callers — the renderer owns the per-token "is this model output"
+ signal, so role-level filtering becomes a downstream constraint
+ rather than a precondition. (Some role markers — e.g. GLM
+ ``<|user|>`` / ``<|observation|>`` after a tool-calling assistant
+ turn — *are* sampled by the model at inference and live inside the
+ next message's span; ``sampled_mask`` captures that, but a
+ naive role filter would mask them out.)
+
+ When ``role_to_mask`` is provided, ``loss_mask`` is the AND of the
+ role-based attribution and the sampled signal: only tokens the
+ model would have produced at inference AND attributed to a
+ trainable role pass through. Useful when the caller needs to
+ restrict training to a specific role (e.g. assistant-only) even on
+ a renderer whose ``sampled_mask`` already covers other roles.
+
Renderers that don't populate ``sampled_mask`` (empty list) fall
back to attribution-only masking — every token attributed to a
trainable role is trained on, including template-injected
- ``<|im_start|>role\\n`` openers.
+ ``<|im_start|>role\\n`` openers. In this fallback mode
+ ``role_to_mask`` is required; calling without it raises
+ ``ValueError``.
``content_sft_roles`` opts in additional roles for "body-only"
supervision: for every message whose role is in this set, tokens
@@ -1393,6 +1407,13 @@ def build_training_sample(
else:
body_roles = frozenset()
+ if role_to_mask is None and not has_sampled_info:
+ raise ValueError(
+ "role_to_mask is required when the renderer does not populate "
+ "sampled_mask. Pass an explicit role filter (e.g. "
+ "lambda m: m['role'] == 'assistant') for this renderer."
+ )
+
loss_mask: list[bool] = []
for k, msg_idx in enumerate(rendered.message_indices):
if msg_idx < 0:
@@ -1408,6 +1429,11 @@ def build_training_sample(
continue
if has_sampled_info and not rendered.sampled_mask[k]:
loss_mask.append(False)
+ elif role_to_mask is None:
+ # sampled_mask alone gates the loss when no role filter is
+ # supplied. ``sampled_mask[k]`` is True here (handled by the
+ # branch above), so this token is trainable.
+ loss_mask.append(True)
else:
loss_mask.append(role_to_mask(msg))
return rendered.token_ids, loss_mask
diff --git a/renderers/glm45.py b/renderers/glm45.py
index ed0e0b7..efea47b 100644
--- a/renderers/glm45.py
+++ b/renderers/glm45.py
@@ -184,6 +184,22 @@ def emit_text_segments(
role = msg["role"]
content = self._visible_text(msg.get("content"))
+ # When the previous message is an assistant, this message's
+ # role-opening token (``<|user|>`` / ``<|observation|>``) is
+ # the inference-time stop signal that closes the assistant's
+ # turn (see ``get_stop_token_ids``). Mark it
+ # ``is_sampled=True`` so the loss-mask pipeline trains the
+ # model to emit it after ``</tool_call>`` (instead of
+ # continuing with another ``<tool_call>`` block). The token
+ # stays attributed to this message (msg_idx=i) and remains
+ # ``is_content=False`` — it's a role-marker / scaffold, not
+ # body bytes, so ``content_mask_for_roles({"tool"})`` and
+ # ``content_token_spans_by_role()`` correctly exclude it
+ # from "tool body" views. Byte stream is unchanged.
+ # ``system`` only appears at the start of a GLM conversation,
+ # so its opener is never the closer of an assistant turn.
+ closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant"
+
if role == "system":
emit_special(self._system, i, is_sampled=False, is_content=False)
# ``\n`` is the scaffold separator after the role tag;
@@ -193,7 +209,12 @@ def emit_text_segments(
)
elif role == "user":
- emit_special(self._user, i, is_sampled=False, is_content=False)
+ emit_special(
+ self._user,
+ i,
+ is_sampled=closes_assistant_turn,
+ is_content=False,
+ )
# ``\n`` is scaffold; ``content`` is body; the optional
# ``/nothink`` suffix is scaffold the renderer injects
# when ``enable_thinking=False``.
@@ -362,6 +383,21 @@ def emit_text_segments(
ext_sampled.append(is_sampled)
ext_content.append(is_content)
+ # The opener-token of the first new_message may also serve as
+ # the close of the previous assistant turn (when the model
+ # failed to sample the stop token itself and the bridge has to
+ # synthesize the boundary above). Unlike :meth:`render`, the
+ # bridge emits these with ``is_sampled=False, is_content=False``
+ # — they are template scaffolding for the *next* step's prompt,
+ # not tokens the model produced *in this* step. The RL loss
+ # operates on ``previous_completion_ids`` (what the model
+ # actually sampled this round); bridge tokens belong to the
+ # subsequent prompt and must not be counted as "model output"
+ # by downstream mask consumers. This deliberate disagreement
+ # with ``render()`` reflects the SFT vs RL semantics: render's
+ # masks describe what the model *should* produce given a
+ # complete conversation; bridge's masks describe what it
+ # *actually* produced this step.
for i, msg in enumerate(new_messages):
role = msg.get("role")
content = self._visible_text(msg.get("content"))
@@ -531,21 +567,24 @@ def _render_tool(
emit_text,
emit_text_segments,
) -> None:
- # Tool messages are conversation history injected by the runtime
- # between assistant turns — the model never samples any of these
- # tokens, so every emission is is_sampled=False. The body bytes
- # get ``is_content=True``; the ``\n\n`` /
- # ``\n`` wraps and the ``<|observation|>`` role
- # tag are scaffold so the SFT mask for tool body never trains
- # the model to emit them. Single BPE pass over the joined text
- # preserves boundary merges (the tool body's leading/trailing
- # chars can merge with the wrap's ``\n``s if the tokenizer would
- # do so; we route through ``emit_text_segments`` so the
- # attribution is offset-driven and tokenizer-agnostic).
- prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
-
- if not prev_is_tool:
- emit_special(self._observation, msg_idx, is_sampled=False, is_content=False)
+ # Tool body bytes get ``is_content=True``; the wraps are
+ # scaffold. The ``<|observation|>`` role tag is scaffold too
+ # (``is_content=False`` so ``content_mask_for_roles({"tool"})``
+ # excludes it). When the previous message is an assistant it
+ # doubles as the inference stop signal for that assistant's
+ # turn — mark it ``is_sampled=True`` so SFT trains the model to
+ # emit it after ``</tool_call>``. The token stays attributed to
+ # this tool message; byte stream is unchanged.
+ prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None
+ closes_assistant_turn = prev_role == "assistant"
+
+ if prev_role != "tool":
+ emit_special(
+ self._observation,
+ msg_idx,
+ is_sampled=closes_assistant_turn,
+ is_content=False,
+ )
emit_text_segments(
[
diff --git a/renderers/glm5.py b/renderers/glm5.py
index f3e28e3..a42a0af 100644
--- a/renderers/glm5.py
+++ b/renderers/glm5.py
@@ -207,12 +207,33 @@ def emit_text_segments(
role = msg["role"]
content = self._visible_text(msg.get("content"))
+ # When the previous message is an assistant, this message's
+ # role-opening token (``<|user|>`` / ``<|observation|>``) is
+ # the inference-time stop signal that closes the assistant's
+ # turn (see ``get_stop_token_ids``). Mark it
+ # ``is_sampled=True`` so the loss-mask pipeline trains the
+ # model to emit it after ``</tool_call>`` (instead of
+ # continuing with another ``<tool_call>`` block). The token
+ # stays attributed to this message (msg_idx=i) and remains
+ # ``is_content=False`` — it's a role-marker / scaffold, not
+ # body bytes, so ``content_mask_for_roles({"tool"})`` and
+ # ``content_token_spans_by_role()`` correctly exclude it
+ # from "tool body" views. Byte stream is unchanged.
+ # ``system`` only appears at the start of a GLM conversation,
+ # so its opener is never the closer of an assistant turn.
+ closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant"
+
if role == "system":
emit_special(self._system, i, is_sampled=False, is_content=False)
emit_text(content, i, is_sampled=False, is_content=True)
elif role == "user":
- emit_special(self._user, i, is_sampled=False, is_content=False)
+ emit_special(
+ self._user,
+ i,
+ is_sampled=closes_assistant_turn,
+ is_content=False,
+ )
emit_text(content, i, is_sampled=False, is_content=True)
elif role == "assistant":
@@ -382,6 +403,21 @@ def emit_text_segments(
ext_sampled.append(is_sampled)
ext_content.append(is_content)
+ # The opener-token of the first new_message may also serve as
+ # the close of the previous assistant turn (when the model
+ # failed to sample the stop token itself and the bridge has to
+ # synthesize the boundary above). Unlike :meth:`render`, the
+ # bridge emits these with ``is_sampled=False, is_content=False``
+ # — they are template scaffolding for the *next* step's prompt,
+ # not tokens the model produced *in this* step. The RL loss
+ # operates on ``previous_completion_ids`` (what the model
+ # actually sampled this round); bridge tokens belong to the
+ # subsequent prompt and must not be counted as "model output"
+ # by downstream mask consumers. This deliberate disagreement
+ # with ``render()`` reflects the SFT vs RL semantics: render's
+ # masks describe what the model *should* produce given a
+ # complete conversation; bridge's masks describe what it
+ # *actually* produced this step.
for i, msg in enumerate(new_messages):
role = msg.get("role")
content = self._visible_text(msg.get("content"))
@@ -566,16 +602,24 @@ def _render_tool(
emit_text,
emit_text_segments,
) -> None:
- # Tool messages are conversation history injected by the runtime
- # between assistant turns — the model never samples any of these
- # tokens, so every emission is is_sampled=False. The tool body
- # bytes get ``is_content=True``; the ``<|observation|>`` /
- # ```` wraps are scaffold so the SFT mask for
- # tool body never trains the model to emit them.
- prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
-
- if not prev_is_tool:
- emit_special(self._observation, msg_idx, is_sampled=False, is_content=False)
+ # Tool body bytes get ``is_content=True``; the wraps are
+ # scaffold. The ``<|observation|>`` role tag is scaffold too
+ # (``is_content=False`` so ``content_mask_for_roles({"tool"})``
+ # excludes it). When the previous message is an assistant it
+ # doubles as the inference stop signal for that assistant's
+ # turn — mark it ``is_sampled=True`` so SFT trains the model to
+ # emit it after ``</tool_call>``. The token stays attributed to
+ # this tool message; byte stream is unchanged.
+ prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None
+ closes_assistant_turn = prev_role == "assistant"
+
+ if prev_role != "tool":
+ emit_special(
+ self._observation,
+ msg_idx,
+ is_sampled=closes_assistant_turn,
+ is_content=False,
+ )
emit_special(
self._tool_response_tok, msg_idx, is_sampled=False, is_content=False