diff --git a/renderers/base.py b/renderers/base.py index 65edf68..e9805c4 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1098,8 +1098,7 @@ def _patched_load(model_name_or_path: str, **kwargs): fastokens.patch_transformers() if not _FASTOKENS_ANNOUNCED: logger.info( - "fastokens enabled — tokenizers load through the Rust BPE " - "fast path (~10x encode speedup)." + "fastokens enabled — tokenizers load through the Rust BPE fast path (~10x encode speedup)." ) _FASTOKENS_ANNOUNCED = True try: @@ -1169,8 +1168,8 @@ def load_tokenizer( def _populate_registry(): if RENDERER_REGISTRY: return - from renderers.default import DefaultRenderer from renderers.deepseek_v3 import DeepSeekV3Renderer + from renderers.default import DefaultRenderer from renderers.glm5 import GLM5Renderer, GLM51Renderer from renderers.glm45 import GLM45Renderer from renderers.gpt_oss import GptOssRenderer @@ -1271,8 +1270,7 @@ def create_renderer( cls = RENDERER_REGISTRY.get(config.name) if cls is None: raise ValueError( - f"Unknown renderer {config.name!r}. " - f"Available: {', '.join(sorted(RENDERER_REGISTRY))}" + f"Unknown renderer {config.name!r}. Available: {', '.join(sorted(RENDERER_REGISTRY))}" ) return cls(tokenizer, config) @@ -1345,7 +1343,7 @@ def build_training_sample( renderer: Renderer, messages: list[Message], *, - role_to_mask: Callable[[Message], bool], + role_to_mask: Callable[[Message], bool] | None = None, tools: list[ToolSpec] | None = None, content_sft_roles: "set[str] | frozenset[str] | None" = None, ) -> tuple[list[int], list[bool]]: @@ -1354,15 +1352,31 @@ def build_training_sample( Single render() call + message_indices → per-token mask. Replaces build_incremental_token_mask (O(N) renders → O(1)). - When the renderer populates ``rendered.sampled_mask``, the loss mask - is the AND of role-based attribution and the sampled signal: only - tokens the model would have produced at inference are trainable. 
- This keeps SFT byte-aligned with the RL trajectory mask (where the - prompt / completion split achieves the same effect structurally). + When ``role_to_mask`` is omitted, ``loss_mask`` is the renderer's + ``sampled_mask`` directly: every token the model would have + produced at inference is trainable, regardless of which message + it's attributed to. This is the recommended default for renderer + callers — the renderer owns the per-token "is this model output" + signal, so role-level filtering becomes a downstream constraint + rather than a precondition. (Some role markers — e.g. GLM + ``<|user|>`` / ``<|observation|>`` after a tool-calling assistant + turn — *are* sampled by the model at inference and live inside the + next message's span; ``sampled_mask`` captures that, but a + naive role filter would mask them out.) + + When ``role_to_mask`` is provided, ``loss_mask`` is the AND of the + role-based attribution and the sampled signal: only tokens the + model would have produced at inference AND attributed to a + trainable role pass through. Useful when the caller needs to + restrict training to a specific role (e.g. assistant-only) even on + a renderer whose ``sampled_mask`` already covers other roles. + Renderers that don't populate ``sampled_mask`` (empty list) fall back to attribution-only masking — every token attributed to a trainable role is trained on, including template-injected - ``<|im_start|>role\\n`` openers. + ``<|im_start|>role\\n`` openers. In this fallback mode + ``role_to_mask`` is required; calling without it raises + ``ValueError``. ``content_sft_roles`` opts in additional roles for "body-only" supervision: for every message whose role is in this set, tokens @@ -1393,6 +1407,13 @@ def build_training_sample( else: body_roles = frozenset() + if role_to_mask is None and not has_sampled_info: + raise ValueError( + "role_to_mask is required when the renderer does not populate " + "sampled_mask. Pass an explicit role filter (e.g. 
" + "lambda m: m['role'] == 'assistant') for this renderer." + ) + loss_mask: list[bool] = [] for k, msg_idx in enumerate(rendered.message_indices): if msg_idx < 0: @@ -1408,6 +1429,11 @@ def build_training_sample( continue if has_sampled_info and not rendered.sampled_mask[k]: loss_mask.append(False) + elif role_to_mask is None: + # sampled_mask alone gates the loss when no role filter is + # supplied. ``sampled_mask[k]`` is True here (handled by the + # branch above), so this token is trainable. + loss_mask.append(True) else: loss_mask.append(role_to_mask(msg)) return rendered.token_ids, loss_mask diff --git a/renderers/glm45.py b/renderers/glm45.py index ed0e0b7..efea47b 100644 --- a/renderers/glm45.py +++ b/renderers/glm45.py @@ -184,6 +184,22 @@ def emit_text_segments( role = msg["role"] content = self._visible_text(msg.get("content")) + # When the previous message is an assistant, this message's + # role-opening token (``<|user|>`` / ``<|observation|>``) is + # the inference-time stop signal that closes the assistant's + # turn (see ``get_stop_token_ids``). Mark it + # ``is_sampled=True`` so the loss-mask pipeline trains the + # model to emit it after ```` (instead of + # continuing with another ```` block). The token + # stays attributed to this message (msg_idx=i) and remains + # ``is_content=False`` — it's a role-marker / scaffold, not + # body bytes, so ``content_mask_for_roles({"tool"})`` and + # ``content_token_spans_by_role()`` correctly exclude it + # from "tool body" views. Byte stream is unchanged. + # ``system`` only appears at the start of a GLM conversation, + # so its opener is never the closer of an assistant turn. 
+ closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant" + if role == "system": emit_special(self._system, i, is_sampled=False, is_content=False) # ``\n`` is the scaffold separator after the role tag; @@ -193,7 +209,12 @@ def emit_text_segments( ) elif role == "user": - emit_special(self._user, i, is_sampled=False, is_content=False) + emit_special( + self._user, + i, + is_sampled=closes_assistant_turn, + is_content=False, + ) # ``\n`` is scaffold; ``content`` is body; the optional # ``/nothink`` suffix is scaffold the renderer injects # when ``enable_thinking=False``. @@ -362,6 +383,21 @@ def emit_text_segments( ext_sampled.append(is_sampled) ext_content.append(is_content) + # The opener-token of the first new_message may also serve as + # the close of the previous assistant turn (when the model + # failed to sample the stop token itself and the bridge has to + # synthesize the boundary above). Unlike :meth:`render`, the + # bridge emits these with ``is_sampled=False, is_content=False`` + # — they are template scaffolding for the *next* step's prompt, + # not tokens the model produced *in this* step. The RL loss + # operates on ``previous_completion_ids`` (what the model + # actually sampled this round); bridge tokens belong to the + # subsequent prompt and must not be counted as "model output" + # by downstream mask consumers. This deliberate disagreement + # with ``render()`` reflects the SFT vs RL semantics: render's + # masks describe what the model *should* produce given a + # complete conversation; bridge's masks describe what it + # *actually* produced this step. for i, msg in enumerate(new_messages): role = msg.get("role") content = self._visible_text(msg.get("content")) @@ -531,21 +567,24 @@ def _render_tool( emit_text, emit_text_segments, ) -> None: - # Tool messages are conversation history injected by the runtime - # between assistant turns — the model never samples any of these - # tokens, so every emission is is_sampled=False. 
The body bytes - # get ``is_content=True``; the ``\n<tool_response>\n`` / - # ``\n</tool_response>`` wraps and the ``<|observation|>`` role - # tag are scaffold so the SFT mask for tool body never trains - # the model to emit them. Single BPE pass over the joined text - # preserves boundary merges (the tool body's leading/trailing - # chars can merge with the wrap's ``\n``s if the tokenizer would - # do so; we route through ``emit_text_segments`` so the - # attribution is offset-driven and tokenizer-agnostic). - prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool" - - if not prev_is_tool: - emit_special(self._observation, msg_idx, is_sampled=False, is_content=False) + # Tool body bytes get ``is_content=True``; the wraps are + # scaffold. The ``<|observation|>`` role tag is scaffold too + # (``is_content=False`` so ``content_mask_for_roles({"tool"})`` + # excludes it). When the previous message is an assistant it + # doubles as the inference stop signal for that assistant's + # turn — mark it ``is_sampled=True`` so SFT trains the model to + # emit it after ``</tool_call>``. The token stays attributed to + # this tool message; byte stream is unchanged. + prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None + closes_assistant_turn = prev_role == "assistant" + + if prev_role != "tool": + emit_special( + self._observation, + msg_idx, + is_sampled=closes_assistant_turn, + is_content=False, + ) emit_text_segments( [ diff --git a/renderers/glm5.py b/renderers/glm5.py index f3e28e3..a42a0af 100644 --- a/renderers/glm5.py +++ b/renderers/glm5.py @@ -207,12 +207,33 @@ def emit_text_segments( role = msg["role"] content = self._visible_text(msg.get("content")) + # When the previous message is an assistant, this message's + # role-opening token (``<|user|>`` / ``<|observation|>``) is + # the inference-time stop signal that closes the assistant's + # turn (see ``get_stop_token_ids``). 
Mark it + # ``is_sampled=True`` so the loss-mask pipeline trains the + # model to emit it after ``</tool_call>`` (instead of + # continuing with another ``<tool_call>`` block). The token + # stays attributed to this message (msg_idx=i) and remains + # ``is_content=False`` — it's a role-marker / scaffold, not + # body bytes, so ``content_mask_for_roles({"tool"})`` and + # ``content_token_spans_by_role()`` correctly exclude it + # from "tool body" views. Byte stream is unchanged. + # ``system`` only appears at the start of a GLM conversation, + # so its opener is never the closer of an assistant turn. + closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant" + if role == "system": emit_special(self._system, i, is_sampled=False, is_content=False) emit_text(content, i, is_sampled=False, is_content=True) elif role == "user": - emit_special(self._user, i, is_sampled=False, is_content=False) + emit_special( + self._user, + i, + is_sampled=closes_assistant_turn, + is_content=False, + ) emit_text(content, i, is_sampled=False, is_content=True) elif role == "assistant": @@ -382,6 +403,21 @@ def emit_text_segments( ext_sampled.append(is_sampled) ext_content.append(is_content) + # The opener-token of the first new_message may also serve as + # the close of the previous assistant turn (when the model + # failed to sample the stop token itself and the bridge has to + # synthesize the boundary above). Unlike :meth:`render`, the + # bridge emits these with ``is_sampled=False, is_content=False`` + # — they are template scaffolding for the *next* step's prompt, + # not tokens the model produced *in this* step. The RL loss + # operates on ``previous_completion_ids`` (what the model + # actually sampled this round); bridge tokens belong to the + # subsequent prompt and must not be counted as "model output" + # by downstream mask consumers. 
This deliberate disagreement + # with ``render()`` reflects the SFT vs RL semantics: render's + # masks describe what the model *should* produce given a + # complete conversation; bridge's masks describe what it + # *actually* produced this step. for i, msg in enumerate(new_messages): role = msg.get("role") content = self._visible_text(msg.get("content")) @@ -566,16 +602,24 @@ def _render_tool( emit_text, emit_text_segments, ) -> None: - # Tool messages are conversation history injected by the runtime - # between assistant turns — the model never samples any of these - # tokens, so every emission is is_sampled=False. The tool body - # bytes get ``is_content=True``; the ``<|observation|>`` / - # ``<tool_response>`` wraps are scaffold so the SFT mask for - # tool body never trains the model to emit them. - prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool" - - if not prev_is_tool: - emit_special(self._observation, msg_idx, is_sampled=False, is_content=False) + # Tool body bytes get ``is_content=True``; the wraps are + # scaffold. The ``<|observation|>`` role tag is scaffold too + # (``is_content=False`` so ``content_mask_for_roles({"tool"})`` + # excludes it). When the previous message is an assistant it + # doubles as the inference stop signal for that assistant's + # turn — mark it ``is_sampled=True`` so SFT trains the model to + # emit it after ``</tool_call>``. The token stays attributed to + # this tool message; byte stream is unchanged. + prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None + closes_assistant_turn = prev_role == "assistant" + + if prev_role != "tool": + emit_special( + self._observation, + msg_idx, + is_sampled=closes_assistant_turn, + is_content=False, + ) emit_special( self._tool_response_tok, msg_idx, is_sampled=False, is_content=False