Skip to content

Commit 911ecce

Browse files
Add e2e tests for empty-mode systemMessage defaults
Three deterministic tests using sendAndWait + element-name instructions:
- default: env_context stripped (ARGON)
- replace: caller content used verbatim (KRYPTON)
- append: caller instruction applied and env_context still stripped (XENON)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent d58befa commit 911ecce

4 files changed

Lines changed: 115 additions & 0 deletions

nodejs/test/e2e/mode_empty.e2e.test.ts

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,28 @@ describe("Mode = empty + ToolSet patterns", async () => {
4444
);
4545
}
4646

47+
async function getSystemMessageSentToLLM(): Promise<string> {
48+
await retry(
49+
"capture chat completion request",
50+
async () => {
51+
const exchanges = await openAiEndpoint.getExchanges();
52+
expect(exchanges.length).toBeGreaterThanOrEqual(1);
53+
},
54+
1_200
55+
);
56+
const exchanges = await openAiEndpoint.getExchanges();
57+
const messages = exchanges[exchanges.length - 1].request.messages ?? [];
58+
const sys = messages.find((m) => m.role === "system");
59+
const content = sys?.content;
60+
if (typeof content === "string") return content;
61+
if (Array.isArray(content)) {
62+
return content
63+
.map((p) => (typeof p === "object" && p && "text" in p ? p.text : ""))
64+
.join("\n");
65+
}
66+
return "";
67+
}
68+
4769
it("empty mode + Isolated set: shell tool is NOT exposed", async () => {
4870
const session = await client.createSession({
4971
onPermissionRequest: approveAll,
@@ -98,4 +120,67 @@ describe("Mode = empty + ToolSet patterns", async () => {
98120

99121
await session.disconnect();
100122
});
123+
124+
it("empty mode strips environment_context from the system message by default", async () => {
125+
// In default empty mode the SDK injects the customize-mode override
126+
// `environment_context: { action: "remove" }`. We check the strip two
127+
// ways. Indirectly: we append a deterministic instruction; with no
128+
// OS/cwd lines in the system message the model has no env info to lean
129+
// on and follows our instruction. Directly: we capture the system
130+
// message sent to the LLM and assert that the "Current working
131+
// directory:" and "Operating System:" lines are absent.
132+
const session = await client.createSession({
133+
onPermissionRequest: approveAll,
134+
availableTools: new ToolSet().addBuiltIn(BuiltInTools.Isolated),
135+
systemMessage: {
136+
mode: "customize",
137+
content:
138+
"If the user asks you to name an element, reply with exactly the single word ARGON in all caps and nothing else.",
139+
},
140+
});
141+
const reply = await session.sendAndWait({ prompt: "Name an element." });
142+
expect(reply?.data.content).toContain("ARGON");
143+
144+
const systemMessage = await getSystemMessageSentToLLM();
145+
expect(systemMessage).not.toMatch(/Current working directory:/i);
146+
expect(systemMessage).not.toMatch(/Operating System:/i);
147+
148+
await session.disconnect();
149+
});
150+
151+
it("empty mode + systemMessage replace: LLM follows caller's content verbatim", async () => {
152+
const session = await client.createSession({
153+
onPermissionRequest: approveAll,
154+
availableTools: new ToolSet().addBuiltIn(BuiltInTools.Isolated),
155+
systemMessage: {
156+
mode: "replace",
157+
content:
158+
"You are a test fixture. Whenever the user asks anything, reply with exactly the single word KRYPTON in all caps and nothing else.",
159+
},
160+
});
161+
const reply = await session.sendAndWait({ prompt: "Hello." });
162+
expect(reply?.data.content).toContain("KRYPTON");
163+
164+
await session.disconnect();
165+
});
166+
167+
it("empty mode + append: caller's instruction takes effect and env_context is still stripped", async () => {
168+
const session = await client.createSession({
169+
onPermissionRequest: approveAll,
170+
availableTools: new ToolSet().addBuiltIn(BuiltInTools.Isolated),
171+
systemMessage: {
172+
mode: "append",
173+
content:
174+
"If the user asks you to name a noble gas, reply with exactly the single word XENON in all caps and nothing else.",
175+
},
176+
});
177+
const reply = await session.sendAndWait({ prompt: "Name a noble gas." });
178+
expect(reply?.data.content).toContain("XENON");
179+
180+
const systemMessage = await getSystemMessageSentToLLM();
181+
expect(systemMessage).not.toMatch(/Current working directory:/i);
182+
expect(systemMessage).not.toMatch(/Operating System:/i);
183+
184+
await session.disconnect();
185+
});
101186
});
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
models:
2+
- claude-sonnet-4.5
3+
conversations:
4+
- messages:
5+
- role: system
6+
content: ${system}
7+
- role: user
8+
content: Name a noble gas.
9+
- role: assistant
10+
content: XENON
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
models:
2+
- claude-sonnet-4.5
3+
conversations:
4+
- messages:
5+
- role: system
6+
content: ${system}
7+
- role: user
8+
content: Hello.
9+
- role: assistant
10+
content: KRYPTON
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
models:
2+
- claude-sonnet-4.5
3+
conversations:
4+
- messages:
5+
- role: system
6+
content: ${system}
7+
- role: user
8+
content: Name an element.
9+
- role: assistant
10+
content: ARGON

0 commit comments

Comments
 (0)