Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 38 additions & 12 deletions renderers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1098,8 +1098,7 @@ def _patched_load(model_name_or_path: str, **kwargs):
fastokens.patch_transformers()
if not _FASTOKENS_ANNOUNCED:
logger.info(
"fastokens enabled — tokenizers load through the Rust BPE "
"fast path (~10x encode speedup)."
"fastokens enabled — tokenizers load through the Rust BPE fast path (~10x encode speedup)."
)
_FASTOKENS_ANNOUNCED = True
try:
Expand Down Expand Up @@ -1169,8 +1168,8 @@ def load_tokenizer(
def _populate_registry():
if RENDERER_REGISTRY:
return
from renderers.default import DefaultRenderer
from renderers.deepseek_v3 import DeepSeekV3Renderer
from renderers.default import DefaultRenderer
from renderers.glm5 import GLM5Renderer, GLM51Renderer
from renderers.glm45 import GLM45Renderer
from renderers.gpt_oss import GptOssRenderer
Expand Down Expand Up @@ -1271,8 +1270,7 @@ def create_renderer(
cls = RENDERER_REGISTRY.get(config.name)
if cls is None:
raise ValueError(
f"Unknown renderer {config.name!r}. "
f"Available: {', '.join(sorted(RENDERER_REGISTRY))}"
f"Unknown renderer {config.name!r}. Available: {', '.join(sorted(RENDERER_REGISTRY))}"
)
return cls(tokenizer, config)

Expand Down Expand Up @@ -1345,7 +1343,7 @@ def build_training_sample(
renderer: Renderer,
messages: list[Message],
*,
role_to_mask: Callable[[Message], bool],
role_to_mask: Callable[[Message], bool] | None = None,
tools: list[ToolSpec] | None = None,
content_sft_roles: "set[str] | frozenset[str] | None" = None,
) -> tuple[list[int], list[bool]]:
Expand All @@ -1354,15 +1352,31 @@ def build_training_sample(
Single render() call + message_indices → per-token mask.
Replaces build_incremental_token_mask (O(N) renders → O(1)).

When the renderer populates ``rendered.sampled_mask``, the loss mask
is the AND of role-based attribution and the sampled signal: only
tokens the model would have produced at inference are trainable.
This keeps SFT byte-aligned with the RL trajectory mask (where the
prompt / completion split achieves the same effect structurally).
When ``role_to_mask`` is omitted, ``loss_mask`` is the renderer's
``sampled_mask`` directly: every token the model would have
produced at inference is trainable, regardless of which message
it's attributed to. This is the recommended default for renderer
callers — the renderer owns the per-token "is this model output"
signal, so role-level filtering becomes a downstream constraint
rather than a precondition. (Some role markers — e.g. GLM
``<|user|>`` / ``<|observation|>`` after a tool-calling assistant
turn — *are* sampled by the model at inference and live inside the
next message's span; ``sampled_mask`` captures that, but a
naive role filter would mask them out.)

When ``role_to_mask`` is provided, ``loss_mask`` is the AND of the
role-based attribution and the sampled signal: only tokens the
model would have produced at inference AND attributed to a
trainable role pass through. Useful when the caller needs to
restrict training to a specific role (e.g. assistant-only) even on
a renderer whose ``sampled_mask`` already covers other roles.

Renderers that don't populate ``sampled_mask`` (empty list) fall
back to attribution-only masking — every token attributed to a
trainable role is trained on, including template-injected
``<|im_start|>role\\n`` openers.
``<|im_start|>role\\n`` openers. In this fallback mode
``role_to_mask`` is required; calling without it raises
``ValueError``.

``content_sft_roles`` opts in additional roles for "body-only"
supervision: for every message whose role is in this set, tokens
Expand Down Expand Up @@ -1393,6 +1407,13 @@ def build_training_sample(
else:
body_roles = frozenset()

if role_to_mask is None and not has_sampled_info:
raise ValueError(
"role_to_mask is required when the renderer does not populate "
"sampled_mask. Pass an explicit role filter (e.g. "
"lambda m: m['role'] == 'assistant') for this renderer."
)

loss_mask: list[bool] = []
for k, msg_idx in enumerate(rendered.message_indices):
if msg_idx < 0:
Expand All @@ -1408,6 +1429,11 @@ def build_training_sample(
continue
if has_sampled_info and not rendered.sampled_mask[k]:
loss_mask.append(False)
elif role_to_mask is None:
Comment thread
cursor[bot] marked this conversation as resolved.
# sampled_mask alone gates the loss when no role filter is
# supplied. ``sampled_mask[k]`` is True here (handled by the
# branch above), so this token is trainable.
loss_mask.append(True)
Comment thread
cursor[bot] marked this conversation as resolved.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stop tokens still excluded from loss

High Severity

The build_training_sample function incorrectly excludes renderer-generated scaffolding tokens (like GLM's <|observation|>) from the loss_mask. Although these tokens have sampled_mask=True, they are still masked out in two cases: when role_to_mask rejects the message they are attributed to, and when content_sft_roles is active, because that path gates on is_content (which is False for scaffolding). This prevents training on these critical tokens.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit cb07cc4. Configure here.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the natural consequence of GLM's chat-template design where the per-turn stop signal lives inside the next message's span — there is no dedicated <|im_end|>-style terminator inside the assistant turn itself. We have three knobs and they trade off:

  1. msg_idx attribution of <|observation|> / <|user|> after an assistant turn. We keep it attached to the next message (the tool/user message it opens) rather than retro-attributing to the assistant. The byte stream still ends the assistant's turn at that token, but structurally it belongs to the next message — which is what every downstream consumer (message_token_spans, role-based slicing, bridge/extension code) expects.
  2. is_sampled=True on that opener — added by this PR. Captures the fact that the model would sample that token at inference to close its turn. This is what makes the default training path (role_to_mask=None) train the model to emit the stop.
  3. is_content=False on that opener — preserved from before. The token is a role marker, not message body, so content_mask_for_roles({"tool"}) and content_token_spans_by_role() correctly exclude it.

The two specialized modes you flag fall out of this:

  • role_to_mask=lambda m: m["role"] == "assistant" — by construction, "only train on assistant-attributed tokens." Stop opener is attributed to the tool message → excluded. Working as specified; if the caller wants the stop trained too, they pass role_to_mask=None (default) and rely on the sampled-only path.
  • content_sft_roles={"tool"} — body-only SFT on tool responses, gated by is_content. Scaffold tokens (including the stop opener) are intentionally is_content=False to keep content_token_spans_by_role honest. In this mixed RL-on-assistant + SFT-on-tool-body mode, the assistant stop-signal supervision belongs to the RL trajectory, not to body-only SFT.

In short, the GLM chat template can't express "this token belongs to message N but counts as message N-1's output for training" cleanly, and the choice we made here (keep structural attribution, use is_sampled as the per-token "model would sample this" signal) is the cleanest fit with the existing build_training_sample contract. Default-mode training works correctly; the filter-mode misses are explicit user opt-ins, not silent bugs.

else:
loss_mask.append(role_to_mask(msg))
return rendered.token_ids, loss_mask
Expand Down
71 changes: 55 additions & 16 deletions renderers/glm45.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,22 @@ def emit_text_segments(
role = msg["role"]
content = self._visible_text(msg.get("content"))

# When the previous message is an assistant, this message's
# role-opening token (``<|user|>`` / ``<|observation|>``) is
# the inference-time stop signal that closes the assistant's
# turn (see ``get_stop_token_ids``). Mark it
# ``is_sampled=True`` so the loss-mask pipeline trains the
# model to emit it after ``</tool_call>`` (instead of
# continuing with another ``<tool_call>`` block). The token
# stays attributed to this message (msg_idx=i) and remains
# ``is_content=False`` — it's a role-marker / scaffold, not
# body bytes, so ``content_mask_for_roles({"tool"})`` and
# ``content_token_spans_by_role()`` correctly exclude it
# from "tool body" views. Byte stream is unchanged.
# ``system`` only appears at the start of a GLM conversation,
# so its opener is never the closer of an assistant turn.
closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant"

if role == "system":
emit_special(self._system, i, is_sampled=False, is_content=False)
# ``\n`` is the scaffold separator after the role tag;
Expand All @@ -193,7 +209,12 @@ def emit_text_segments(
)

elif role == "user":
emit_special(self._user, i, is_sampled=False, is_content=False)
emit_special(
self._user,
i,
is_sampled=closes_assistant_turn,
is_content=False,
)
# ``\n`` is scaffold; ``content`` is body; the optional
# ``/nothink`` suffix is scaffold the renderer injects
# when ``enable_thinking=False``.
Expand Down Expand Up @@ -362,6 +383,21 @@ def emit_text_segments(
ext_sampled.append(is_sampled)
ext_content.append(is_content)

# The opener-token of the first new_message may also serve as
# the close of the previous assistant turn (when the model
# failed to sample the stop token itself and the bridge has to
# synthesize the boundary above). Unlike :meth:`render`, the
# bridge emits these with ``is_sampled=False, is_content=False``
# — they are template scaffolding for the *next* step's prompt,
# not tokens the model produced *in this* step. The RL loss
# operates on ``previous_completion_ids`` (what the model
# actually sampled this round); bridge tokens belong to the
# subsequent prompt and must not be counted as "model output"
# by downstream mask consumers. This deliberate disagreement
# with ``render()`` reflects the SFT vs RL semantics: render's
# masks describe what the model *should* produce given a
# complete conversation; bridge's masks describe what it
# *actually* produced this step.
for i, msg in enumerate(new_messages):
role = msg.get("role")
content = self._visible_text(msg.get("content"))
Expand Down Expand Up @@ -531,21 +567,24 @@ def _render_tool(
emit_text,
emit_text_segments,
) -> None:
# Tool messages are conversation history injected by the runtime
# between assistant turns — the model never samples any of these
# tokens, so every emission is is_sampled=False. The body bytes
# get ``is_content=True``; the ``\n<tool_response>\n`` /
# ``\n</tool_response>`` wraps and the ``<|observation|>`` role
# tag are scaffold so the SFT mask for tool body never trains
# the model to emit them. Single BPE pass over the joined text
# preserves boundary merges (the tool body's leading/trailing
# chars can merge with the wrap's ``\n``s if the tokenizer would
# do so; we route through ``emit_text_segments`` so the
# attribution is offset-driven and tokenizer-agnostic).
prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"

if not prev_is_tool:
emit_special(self._observation, msg_idx, is_sampled=False, is_content=False)
# Tool body bytes get ``is_content=True``; the wraps are
# scaffold. The ``<|observation|>`` role tag is scaffold too
# (``is_content=False`` so ``content_mask_for_roles({"tool"})``
# excludes it). When the previous message is an assistant it
# doubles as the inference stop signal for that assistant's
# turn — mark it ``is_sampled=True`` so SFT trains the model to
# emit it after ``</tool_call>``. The token stays attributed to
# this tool message; byte stream is unchanged.
prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None
closes_assistant_turn = prev_role == "assistant"

if prev_role != "tool":
emit_special(
self._observation,
msg_idx,
is_sampled=closes_assistant_turn,
is_content=False,
)
Comment thread
cursor[bot] marked this conversation as resolved.

emit_text_segments(
[
Expand Down
66 changes: 55 additions & 11 deletions renderers/glm5.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,33 @@ def emit_text_segments(
role = msg["role"]
content = self._visible_text(msg.get("content"))

# When the previous message is an assistant, this message's
# role-opening token (``<|user|>`` / ``<|observation|>``) is
# the inference-time stop signal that closes the assistant's
# turn (see ``get_stop_token_ids``). Mark it
# ``is_sampled=True`` so the loss-mask pipeline trains the
# model to emit it after ``</tool_call>`` (instead of
# continuing with another ``<tool_call>`` block). The token
# stays attributed to this message (msg_idx=i) and remains
# ``is_content=False`` — it's a role-marker / scaffold, not
# body bytes, so ``content_mask_for_roles({"tool"})`` and
# ``content_token_spans_by_role()`` correctly exclude it
# from "tool body" views. Byte stream is unchanged.
# ``system`` only appears at the start of a GLM conversation,
# so its opener is never the closer of an assistant turn.
closes_assistant_turn = i > 0 and messages[i - 1]["role"] == "assistant"

if role == "system":
emit_special(self._system, i, is_sampled=False, is_content=False)
emit_text(content, i, is_sampled=False, is_content=True)

elif role == "user":
emit_special(self._user, i, is_sampled=False, is_content=False)
emit_special(
self._user,
i,
is_sampled=closes_assistant_turn,
is_content=False,
)
emit_text(content, i, is_sampled=False, is_content=True)

elif role == "assistant":
Expand Down Expand Up @@ -382,6 +403,21 @@ def emit_text_segments(
ext_sampled.append(is_sampled)
ext_content.append(is_content)

# The opener-token of the first new_message may also serve as
# the close of the previous assistant turn (when the model
# failed to sample the stop token itself and the bridge has to
# synthesize the boundary above). Unlike :meth:`render`, the
# bridge emits these with ``is_sampled=False, is_content=False``
# — they are template scaffolding for the *next* step's prompt,
# not tokens the model produced *in this* step. The RL loss
# operates on ``previous_completion_ids`` (what the model
# actually sampled this round); bridge tokens belong to the
# subsequent prompt and must not be counted as "model output"
# by downstream mask consumers. This deliberate disagreement
# with ``render()`` reflects the SFT vs RL semantics: render's
# masks describe what the model *should* produce given a
# complete conversation; bridge's masks describe what it
# *actually* produced this step.
for i, msg in enumerate(new_messages):
role = msg.get("role")
content = self._visible_text(msg.get("content"))
Expand Down Expand Up @@ -566,16 +602,24 @@ def _render_tool(
emit_text,
emit_text_segments,
) -> None:
# Tool messages are conversation history injected by the runtime
# between assistant turns — the model never samples any of these
# tokens, so every emission is is_sampled=False. The tool body
# bytes get ``is_content=True``; the ``<|observation|>`` /
# ``<tool_response>`` wraps are scaffold so the SFT mask for
# tool body never trains the model to emit them.
prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"

if not prev_is_tool:
emit_special(self._observation, msg_idx, is_sampled=False, is_content=False)
# Tool body bytes get ``is_content=True``; the wraps are
# scaffold. The ``<|observation|>`` role tag is scaffold too
# (``is_content=False`` so ``content_mask_for_roles({"tool"})``
# excludes it). When the previous message is an assistant it
# doubles as the inference stop signal for that assistant's
# turn — mark it ``is_sampled=True`` so SFT trains the model to
# emit it after ``</tool_call>``. The token stays attributed to
# this tool message; byte stream is unchanged.
prev_role = messages[msg_idx - 1]["role"] if msg_idx > 0 else None
closes_assistant_turn = prev_role == "assistant"

if prev_role != "tool":
emit_special(
self._observation,
msg_idx,
is_sampled=closes_assistant_turn,
is_content=False,
)

emit_special(
self._tool_response_tok, msg_idx, is_sampled=False, is_content=False
Expand Down
Loading