PrimeIntellect-ai · hallerite · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/renderers/base.py b/renderers/base.py
@@ -1098,8 +1098,7 @@ def _patched_load(model_name_or_path: str, **kwargs):
             fastokens.patch_transformers()
         if not _FASTOKENS_ANNOUNCED:
             logger.info(
-                "fastokens enabled — tokenizers load through the Rust BPE "
-                "fast path (~10x encode speedup)."
+                "fastokens enabled — tokenizers load through the Rust BPE fast path (~10x encode speedup)."
             )
             _FASTOKENS_ANNOUNCED = True
     try:
@@ -1169,8 +1168,8 @@ def load_tokenizer(
 def _populate_registry():
     if RENDERER_REGISTRY:
         return
-    from renderers.default import DefaultRenderer
     from renderers.deepseek_v3 import DeepSeekV3Renderer
+    from renderers.default import DefaultRenderer
     from renderers.glm5 import GLM5Renderer, GLM51Renderer
     from renderers.glm45 import GLM45Renderer
     from renderers.gpt_oss import GptOssRenderer
@@ -1271,8 +1270,7 @@ def create_renderer(
         cls = RENDERER_REGISTRY.get(config.name)
         if cls is None:
             raise ValueError(
-                f"Unknown renderer {config.name!r}. "
-                f"Available: {', '.join(sorted(RENDERER_REGISTRY))}"
+                f"Unknown renderer {config.name!r}. Available: {', '.join(sorted(RENDERER_REGISTRY))}"
             )
         return cls(tokenizer, config)
 
@@ -1345,7 +1343,7 @@ def build_training_sample(
     renderer: Renderer,
     messages: list[Message],
     *,
-    role_to_mask: Callable[[Message], bool],
+    role_to_mask: Callable[[Message], bool] | None = None,
     tools: list[ToolSpec] | None = None,
     content_sft_roles: "set[str] | frozenset[str] | None" = None,
 ) -> tuple[list[int], list[bool]]:
@@ -1354,15 +1352,31 @@ def build_training_sample(
     Single render() call + message_indices → per-token mask.
     Replaces build_incremental_token_mask (O(N) renders → O(1)).
 
-    When the renderer populates ``rendered.sampled_mask``, the loss mask
-    is the AND of role-based attribution and the sampled signal: only
-    tokens the model would have produced at inference are trainable.
-    This keeps SFT byte-aligned with the RL trajectory mask (where the
-    prompt / completion split achieves the same effect structurally).
+    When ``role_to_mask`` is omitted, ``loss_mask`` is the renderer's
+    ``sampled_mask`` directly: every token the model would have
+    produced at inference is trainable, regardless of which message
+    it's attributed to. This is the recommended default for renderer
+    callers — the renderer owns the per-token "is this model output"
+    signal, so role-level filtering becomes a downstream constraint
+    rather than a precondition. (Some role markers — e.g. GLM
+    ``<|user|>`` / ``<|observation|>`` after a tool-calling assistant
+    turn — *are* sampled by the model at inference and live inside the
+    next message's span; ``sampled_mask`` captures that, but a
+    naive role filter would mask them out.)
+
+    When ``role_to_mask`` is provided, ``loss_mask`` is the AND of the
+    role-based attribution and the sampled signal: only tokens that
+    the model would have produced at inference AND that are attributed
+    to a trainable role pass through. Useful when the caller needs to
+    restrict training to a specific role (e.g. assistant-only) even on
+    a renderer whose ``sampled_mask`` already covers other roles.
+
     Renderers that don't populate ``sampled_mask`` (empty list) fall
     back to attribution-only masking — every token attributed to a
     trainable role is trained on, including template-injected
-    ``<|im_start|>role\\n`` openers.
+    ``<|im_start|>role\\n`` openers. In this fallback mode
+    ``role_to_mask`` is required; calling without it raises
+    ``ValueError``.
 
     ``content_sft_roles`` opts in additional roles for "body-only"
     supervision: for every message whose role is in this set, tokens
@@ -1393,6 +1407,13 @@ def build_training_sample(
     else:
         body_roles = frozenset()
 
+    if role_to_mask is None and not has_sampled_info:
+        raise ValueError(
+            "role_to_mask is required when the renderer does not populate "
+            "sampled_mask. Pass an explicit role filter (e.g. "
+            "lambda m: m['role'] == 'assistant') for this renderer."
+        )
+
     loss_mask: list[bool] = []
     for k, msg_idx in enumerate(rendered.message_indices):
         if msg_idx < 0:
@@ -1408,6 +1429,11 @@ def build_training_sample(
             continue
         if has_sampled_info and not rendered.sampled_mask[k]:
             loss_mask.append(False)
+        elif role_to_mask is None:
+            # No role filter supplied: sampled_mask alone gates the loss.
+            # The guard before this loop ensures sampled_mask is populated,
+            # and the branch above took the False case, so this token is sampled.
+            loss_mask.append(True)
         else:
             loss_mask.append(role_to_mask(msg))
     return rendered.token_ids, loss_mask

diff --git a/renderers/glm45.py b/renderers/glm45.py
@@ -184,6 +184,22 @@ def emit_text_segments(
             role = msg["role"]
             content = self._visible_text(msg.get("content"))
 
+            # When the previous message is an assistant, this message's
+            # role-opening token (``<|user|>`` / ``<|observation|>``) is
+            # the inference-time stop signal that closes the assistant's
+            # turn (see ``get_stop_token_ids``). Mark it
+            # ``is_sampled=True`` so the loss-mask pipeline trains the
+            # model to emit it at turn end (e.g. after ``</tool_call>``,
+            # instead of continuing with another ``<tool_call>`` block). The token
+            # stays attributed to this message (msg_idx=i) and remains
+            # ``is_content=False`` — it's a role-marker / scaffold, not
+            # body bytes, so ``content_mask_for_roles({"tool"})`` and
+            # ``content_token_spans_by_role()`` correctly exclude it
+            # from "tool body" views. Byte stream is unchanged.
+            # ``system`` only appears at the start of a GLM conversation,
+            # so its opener is never the closer of an assistant turn.
+            closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant"
+
             if role == "system":
                 emit_special(self._system, i, is_sampled=False, is_content=False)
                 # ``\n`` is the scaffold separator after the role tag;
@@ -193,7 +209,12 @@ def emit_text_segments(
                 )
 
             elif role == "user":
-                emit_special(self._user, i, is_sampled=False, is_content=False)
+                emit_special(
+                    self._user,
+                    i,
+                    is_sampled=closes_assistant_turn,
+                    is_content=False,
+                )
                 # ``\n`` is scaffold; ``content`` is body; the optional
                 # ``/nothink`` suffix is scaffold the renderer injects
                 # when ``enable_thinking=False``.
@@ -362,6 +383,21 @@ def emit_text_segments(
                 ext_sampled.append(is_sampled)
                 ext_content.append(is_content)
 
+        # The opener-token of the first new_message may also serve as
+        # the close of the previous assistant turn (when the model
+        # failed to sample the stop token itself and the bridge has to
+        # synthesize the boundary above). Unlike :meth:`render`, the
+        # bridge emits these with ``is_sampled=False, is_content=False``
+        # — they are template scaffolding for the *next* step's prompt,
+        # not tokens the model produced *in this* step. The RL loss
+        # operates on ``previous_completion_ids`` (what the model
+        # actually sampled this round); bridge tokens belong to the
+        # subsequent prompt and must not be counted as "model output"
+        # by downstream mask consumers. This deliberate disagreement
+        # with ``render()`` reflects the SFT vs RL semantics: render's
+        # masks describe what the model *should* produce given a
+        # complete conversation; bridge's masks describe what it
+        # *actually* produced this step.
         for i, msg in enumerate(new_messages):
             role = msg.get("role")
             content = self._visible_text(msg.get("content"))
@@ -531,21 +567,24 @@ def _render_tool(
         emit_text,
         emit_text_segments,
     ) -> None:
-        # Tool messages are conversation history injected by the runtime
-        # between assistant turns — the model never samples any of these
-        # tokens, so every emission is is_sampled=False. The body bytes
-        # get ``is_content=True``; the ``\n<tool_response>\n`` /
-        # ``\n</tool_response>`` wraps and the ``<|observation|>`` role
-        # tag are scaffold so the SFT mask for tool body never trains
-        # the model to emit them. Single BPE pass over the joined text
-        # preserves boundary merges (the tool body's leading/trailing
-        # chars can merge with the wrap's ``\n``s if the tokenizer would
-        # do so; we route through ``emit_text_segments`` so the
-        # attribution is offset-driven and tokenizer-agnostic).
-        prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
-
-        if not prev_is_tool:
-            emit_special(self._observation, msg_idx, is_sampled=False, is_content=False)
+        # Tool body bytes get ``is_content=True``; the wraps are
+        # scaffold. The ``<|observation|>`` role tag is scaffold too
+        # (``is_content=False`` so ``content_mask_for_roles({"tool"})``
+        # excludes it). When the previous message is an assistant it
+        # doubles as the inference stop signal for that assistant's
+        # turn — mark it ``is_sampled=True`` so SFT trains the model to
+        # emit it after ``</tool_call>``. The token stays attributed to
+        # this tool message; byte stream is unchanged.
+        prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None
+        closes_assistant_turn = prev_role == "assistant"
+
+        if prev_role != "tool":
+            emit_special(
+                self._observation,
+                msg_idx,
+                is_sampled=closes_assistant_turn,
+                is_content=False,
+            )
 
         emit_text_segments(
             [

diff --git a/renderers/glm5.py b/renderers/glm5.py
@@ -207,12 +207,33 @@ def emit_text_segments(
             role = msg["role"]
             content = self._visible_text(msg.get("content"))
 
+            # When the previous message is an assistant, this message's
+            # role-opening token (``<|user|>`` / ``<|observation|>``) is
+            # the inference-time stop signal that closes the assistant's
+            # turn (see ``get_stop_token_ids``). Mark it
+            # ``is_sampled=True`` so the loss-mask pipeline trains the
+            # model to emit it at turn end (e.g. after ``</tool_call>``,
+            # instead of continuing with another ``<tool_call>`` block). The token
+            # stays attributed to this message (msg_idx=i) and remains
+            # ``is_content=False`` — it's a role-marker / scaffold, not
+            # body bytes, so ``content_mask_for_roles({"tool"})`` and
+            # ``content_token_spans_by_role()`` correctly exclude it
+            # from "tool body" views. Byte stream is unchanged.
+            # ``system`` only appears at the start of a GLM conversation,
+            # so its opener is never the closer of an assistant turn.
+            closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant"
+
             if role == "system":
                 emit_special(self._system, i, is_sampled=False, is_content=False)
                 emit_text(content, i, is_sampled=False, is_content=True)
 
             elif role == "user":
-                emit_special(self._user, i, is_sampled=False, is_content=False)
+                emit_special(
+                    self._user,
+                    i,
+                    is_sampled=closes_assistant_turn,
+                    is_content=False,
+                )
                 emit_text(content, i, is_sampled=False, is_content=True)
 
             elif role == "assistant":
@@ -382,6 +403,21 @@ def emit_text_segments(
                 ext_sampled.append(is_sampled)
                 ext_content.append(is_content)
 
+        # The opener-token of the first new_message may also serve as
+        # the close of the previous assistant turn (when the model
+        # failed to sample the stop token itself and the bridge has to
+        # synthesize the boundary above). Unlike :meth:`render`, the
+        # bridge emits these with ``is_sampled=False, is_content=False``
+        # — they are template scaffolding for the *next* step's prompt,
+        # not tokens the model produced *in this* step. The RL loss
+        # operates on ``previous_completion_ids`` (what the model
+        # actually sampled this round); bridge tokens belong to the
+        # subsequent prompt and must not be counted as "model output"
+        # by downstream mask consumers. This deliberate disagreement
+        # with ``render()`` reflects the SFT vs RL semantics: render's
+        # masks describe what the model *should* produce given a
+        # complete conversation; bridge's masks describe what it
+        # *actually* produced this step.
         for i, msg in enumerate(new_messages):
             role = msg.get("role")
             content = self._visible_text(msg.get("content"))
@@ -566,16 +602,24 @@ def _render_tool(
         emit_text,
         emit_text_segments,
     ) -> None:
-        # Tool messages are conversation history injected by the runtime
-        # between assistant turns — the model never samples any of these
-        # tokens, so every emission is is_sampled=False. The tool body
-        # bytes get ``is_content=True``; the ``<|observation|>`` /
-        # ``<tool_response>`` wraps are scaffold so the SFT mask for
-        # tool body never trains the model to emit them.
-        prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
-
-        if not prev_is_tool:
-            emit_special(self._observation, msg_idx, is_sampled=False, is_content=False)
+        # Tool body bytes get ``is_content=True``; the wraps are
+        # scaffold. The ``<|observation|>`` role tag is scaffold too
+        # (``is_content=False`` so ``content_mask_for_roles({"tool"})``
+        # excludes it). When the previous message is an assistant it
+        # doubles as the inference stop signal for that assistant's
+        # turn — mark it ``is_sampled=True`` so SFT trains the model to
+        # emit it after ``</tool_call>``. The token stays attributed to
+        # this tool message; byte stream is unchanged.
+        prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None
+        closes_assistant_turn = prev_role == "assistant"
+
+        if prev_role != "tool":
+            emit_special(
+                self._observation,
+                msg_idx,
+                is_sampled=closes_assistant_turn,
+                is_content=False,
+            )
 
         emit_special(
             self._tool_response_tok, msg_idx, is_sampled=False, is_content=False