Skip to content

Commit df6478c

Browse files
committed
updates
1 parent 1649a5d commit df6478c

File tree

7 files changed

+709
-139
lines changed

7 files changed

+709
-139
lines changed

.github/workflows/chat-perf.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ on:
55
paths:
66
- '.github/workflows/chat-perf.yml'
77
schedule:
8-
# Every Friday at 12:00 AM PT (07:00 UTC)
9-
- cron: '0 7 * * 5'
8+
# Nightly at 07:00 UTC (12:00 AM PDT / 11:00 PM PST)
9+
- cron: '0 7 * * *'
1010
workflow_dispatch:
1111
inputs:
1212
baseline_commit:
@@ -49,7 +49,7 @@ env:
4949

5050
jobs:
5151
chat-perf:
52-
name: Chat Perf – ${{ inputs.baseline_commit || 'config default' }} vs ${{ inputs.test_commit || github.sha }}
52+
name: Chat Perf
5353
runs-on: ubuntu-latest
5454
timeout-minutes: 120
5555
steps:

scripts/chat-simulation/common/mock-llm-server.js

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ function handleRequest(req, res) {
445445
tokenizer: 'o200k_base',
446446
limits: {
447447
max_prompt_tokens: 128000,
448-
max_output_tokens: 16384,
448+
max_output_tokens: 131072,
449449
max_context_window_tokens: 128000,
450450
},
451451
supports: {
@@ -472,7 +472,7 @@ function handleRequest(req, res) {
472472
tokenizer: 'o200k_base',
473473
limits: {
474474
max_prompt_tokens: 128000,
475-
max_output_tokens: 16384,
475+
max_output_tokens: 131072,
476476
max_context_window_tokens: 128000,
477477
},
478478
supports: {
@@ -508,7 +508,7 @@ function handleRequest(req, res) {
508508
type: 'chat',
509509
family: 'gpt-4o',
510510
tokenizer: 'o200k_base',
511-
limits: { max_prompt_tokens: 128000, max_output_tokens: 16384, max_context_window_tokens: 128000 },
511+
limits: { max_prompt_tokens: 128000, max_output_tokens: 131072, max_context_window_tokens: 128000 },
512512
supports: { streaming: true, tool_calls: true, parallel_tool_calls: true, vision: false },
513513
},
514514
});
@@ -599,20 +599,36 @@ const serverEvents = new EventEmitter();
599599
const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
600600

601601
/**
602-
* Count the number of model turns already completed in the conversation.
603-
* A model turn is one of:
604-
* - An assistant message with tool_calls (tool-calls turn)
605-
* - An assistant message with content but no tool_calls (content/thinking turn)
606-
* The first assistant message after each user message counts as a new model
607-
* turn. User turns in the scenario are detected by counting user messages
608-
* beyond the initial one.
602+
* Count the number of model turns already completed for the CURRENT scenario.
603+
* Only counts assistant messages that appear after the last user message
604+
* containing a [scenario:X] tag. This prevents assistant messages from
605+
* previous scenarios (in the same chat session) from inflating the count.
606+
*
609607
* @param {any[]} messages
610608
* @returns {number}
611609
*/
612610
function countCompletedModelTurns(messages) {
611+
// Find the index of the last user message with a scenario tag
612+
let scenarioMsgIdx = -1;
613+
for (let i = messages.length - 1; i >= 0; i--) {
614+
const msg = messages[i];
615+
if (msg.role !== 'user') { continue; }
616+
const content = typeof msg.content === 'string'
617+
? msg.content
618+
: Array.isArray(msg.content)
619+
? msg.content.map((/** @type {any} */ c) => c.text || '').join('')
620+
: '';
621+
if (/\[scenario:[^\]]+\]/.test(content)) {
622+
scenarioMsgIdx = i;
623+
break;
624+
}
625+
}
626+
627+
// Count assistant messages after the scenario tag message
613628
let turns = 0;
614-
for (const msg of messages) {
615-
if (msg.role === 'assistant') {
629+
const startIdx = scenarioMsgIdx >= 0 ? scenarioMsgIdx + 1 : 0;
630+
for (let i = startIdx; i < messages.length; i++) {
631+
if (messages[i].role === 'assistant') {
616632
turns++;
617633
}
618634
}
@@ -680,9 +696,14 @@ async function handleChatCompletions(body, res) {
680696
console.log(`[mock-llm] ${ts}${requestToolNames.length} tools available: ${requestToolNames.join(', ')}`);
681697
}
682698

683-
// Search all user messages for the scenario tag (not just the last one,
684-
// since follow-up user messages in multi-turn scenarios won't have it).
685-
for (const msg of messages) {
699+
// Search user messages in reverse order (newest first) for the scenario
700+
// tag. This ensures the most recent message's tag takes precedence when
701+
// multiple messages with different tags exist in the same conversation
702+
// (e.g. in the leak checker which sends many scenarios in one session).
703+
// Follow-up user messages in multi-turn scenarios won't have a tag, so
704+
// searching backwards still finds the correct tag from the initial message.
705+
for (let mi = messages.length - 1; mi >= 0; mi--) {
706+
const msg = messages[mi];
686707
if (msg.role !== 'user') { continue; }
687708
const content = typeof msg.content === 'string'
688709
? msg.content

scripts/chat-simulation/common/perf-scenarios.js

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,113 @@ const TOOL_CALL_SCENARIOS = {
357357
],
358358
};
359359
})()),
360+
361+
// Terminal tool: run commands, read output, run more commands.
362+
// Simulates an agent installing dependencies, running tests, and
363+
// diagnosing failures — a common agentic workflow.
364+
'tool-terminal': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ({
365+
type: 'multi-turn',
366+
turns: [
367+
// Round 1: run initial commands (install + build)
368+
{
369+
kind: 'tool-calls',
370+
toolCalls: [
371+
{
372+
toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i,
373+
arguments: {
374+
command: 'echo "Installing dependencies..." && echo "added 1631 packages in 6m"',
375+
explanation: 'Install project dependencies',
376+
goal: 'Install dependencies',
377+
mode: 'sync',
378+
timeout: 30000,
379+
},
380+
},
381+
],
382+
},
383+
// Round 2: run test command
384+
{
385+
kind: 'tool-calls',
386+
toolCalls: [
387+
{
388+
toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i,
389+
arguments: {
390+
command: 'echo "Running unit tests..." && echo " 42 passing (3s)" && echo " 2 failing" && echo "" && echo " 1) ChatService should dispose listeners" && echo " AssertionError: expected 0 to equal 1" && echo " 2) ChatModel should clear on new session" && echo " TypeError: Cannot read property dispose of undefined"',
391+
explanation: 'Run the unit test suite to check for failures',
392+
goal: 'Run tests',
393+
mode: 'sync',
394+
timeout: 60000,
395+
},
396+
},
397+
],
398+
},
399+
// Round 3: read the failing test file for context
400+
{
401+
kind: 'tool-calls',
402+
toolCalls: [
403+
{
404+
toolNamePattern: /read.?file/i,
405+
arguments: { filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'), startLine: 1, endLine: 50 },
406+
},
407+
],
408+
},
409+
// Round 4: fix the issue with an edit
410+
{
411+
kind: 'tool-calls',
412+
toolCalls: [
413+
{
414+
toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i,
415+
arguments: {
416+
filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'),
417+
oldString: '// perf-benchmark-marker',
418+
newString: '// perf-benchmark-marker (fixed)',
419+
explanation: 'Fix the dispose call in the test',
420+
},
421+
},
422+
],
423+
},
424+
// Round 5: re-run tests to confirm
425+
{
426+
kind: 'tool-calls',
427+
toolCalls: [
428+
{
429+
toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i,
430+
arguments: {
431+
command: 'echo "Running unit tests..." && echo " 44 passing (3s)" && echo " 0 failing"',
432+
explanation: 'Re-run tests to verify the fix',
433+
goal: 'Verify fix',
434+
mode: 'sync',
435+
timeout: 60000,
436+
},
437+
},
438+
],
439+
},
440+
// Round 6: final summary
441+
{
442+
kind: 'content',
443+
chunks: new ScenarioBuilder()
444+
.wait(20, '## Test Failures Fixed\n\n')
445+
.stream([
446+
'I found and fixed 2 test failures:\n\n',
447+
'### Root Cause\n',
448+
'The `ChatService` was not properly disposing event listeners when a session was cleared. ',
449+
'The `dispose()` method was missing a call to `this._store.dispose()`.\n\n',
450+
'### Changes Made\n',
451+
'Updated `lifecycle.ts` to properly chain disposal:\n\n',
452+
'```typescript\n',
453+
'override dispose(): void {\n',
454+
' this._store.dispose();\n',
455+
' super.dispose();\n',
456+
'}\n',
457+
'```\n\n',
458+
'### Test Results\n',
459+
'- **Before**: 42 passing, 2 failing\n',
460+
'- **After**: 44 passing, 0 failing\n\n',
461+
'All tests pass now. The fix ensures listeners are cleaned up during session transitions.\n',
462+
], 15)
463+
.build(),
464+
},
465+
],
466+
}),
360467
};
361468

362469
// -- Multi-turn user conversation scenarios -----------------------------------

scripts/chat-simulation/common/utils.js

Lines changed: 122 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ function buildEnv(mockServer, { isDevBuild = true } = {}) {
193193
* @param {string} logsDir
194194
* @returns {string[]}
195195
*/
196-
function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true } = {}) {
196+
function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true, extHostInspectPort = 0 } = {}) {
197197
const args = [
198198
ROOT,
199199
'--skip-release-notes',
@@ -213,6 +213,13 @@ function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true } = {}) {
213213
if (process.platform !== 'darwin') {
214214
args.push('--disable-gpu');
215215
}
216+
if (process.env.CI && process.platform === 'linux') {
217+
args.push('--no-sandbox');
218+
}
219+
// Enable extension host inspector for profiling/heap snapshots
220+
if (extHostInspectPort > 0) {
221+
args.push(`--inspect-extensions=${extHostInspectPort}`);
222+
}
216223
return args;
217224
}
218225

@@ -228,6 +235,8 @@ function writeSettings(userDataDir, mockServer) {
228235
'github.copilot.advanced.debug.overrideProxyUrl': mockServer.url,
229236
'github.copilot.advanced.debug.overrideCapiUrl': mockServer.url,
230237
'chat.allowAnonymousAccess': true,
238+
// Start new chat sessions in agent mode so tools are available.
239+
'chat.newSession.defaultMode': 'agent',
231240
// Disable MCP servers — they start async and add unpredictable
232241
// delay that pollutes perf measurements.
233242
'chat.mcp.discovery.enabled': false,
@@ -275,6 +284,112 @@ function prepareRunDir(runId, mockServer) {
275284

276285
// -- VS Code launch via CDP --------------------------------------------------
277286

287+
// -- Extension host inspector ------------------------------------------------
288+
289+
/** @type {number} */
290+
let nextExtHostPort = 29222;
291+
292+
/**
 * Hand out the next extension host inspector port.
 *
 * Returns the current value of the module-level `nextExtHostPort` counter and
 * then advances the counter by one, so consecutive calls yield consecutive
 * port numbers.
 *
 * @returns {number}
 */
function getNextExtHostInspectPort() {
	const port = nextExtHostPort;
	nextExtHostPort += 1;
	return port;
}
296+
297+
/**
298+
* Connect to the extension host's Node inspector via WebSocket.
299+
* The extension host must be started with `--inspect-extensions=<port>`.
300+
*
301+
* @param {number} port
302+
* @param {{ verbose?: boolean, timeoutMs?: number }} [opts]
303+
* @returns {Promise<{ send: (method: string, params?: any) => Promise<any>, on: (event: string, listener: (params: any) => void) => void, close: () => void, port: number }>}
304+
*/
305+
async function connectToExtHostInspector(port, opts = {}) {
306+
const { verbose = false, timeoutMs = 30_000 } = opts;
307+
308+
// Wait for the inspector endpoint to be available
309+
const deadline = Date.now() + timeoutMs;
310+
/** @type {any} */
311+
let wsUrl;
312+
while (Date.now() < deadline) {
313+
try {
314+
const targets = await getJson(`http://127.0.0.1:${port}/json`);
315+
if (targets.length > 0 && targets[0].webSocketDebuggerUrl) {
316+
wsUrl = targets[0].webSocketDebuggerUrl;
317+
break;
318+
}
319+
} catch { }
320+
await new Promise(r => setTimeout(r, 500));
321+
}
322+
if (!wsUrl) {
323+
throw new Error(`Timed out waiting for extension host inspector on port ${port}`);
324+
}
325+
326+
if (verbose) {
327+
console.log(` [ext-host] Connected to inspector: ${wsUrl}`);
328+
}
329+
330+
const WebSocket = require('ws');
331+
const ws = new WebSocket(wsUrl);
332+
await new Promise((resolve, reject) => {
333+
ws.once('open', resolve);
334+
ws.once('error', reject);
335+
});
336+
337+
let msgId = 1;
338+
/** @type {Map<number, { resolve: (v: any) => void, reject: (e: Error) => void }>} */
339+
const pending = new Map();
340+
/** @type {Map<string, ((params: any) => void)[]>} */
341+
const eventListeners = new Map();
342+
343+
ws.on('message', (/** @type {Buffer} */ data) => {
344+
const msg = JSON.parse(data.toString());
345+
if (msg.id !== undefined) {
346+
const p = pending.get(msg.id);
347+
if (p) {
348+
pending.delete(msg.id);
349+
if (msg.error) { p.reject(new Error(msg.error.message)); }
350+
else { p.resolve(msg.result); }
351+
}
352+
} else if (msg.method) {
353+
const listeners = eventListeners.get(msg.method) || [];
354+
for (const listener of listeners) { listener(msg.params); }
355+
}
356+
});
357+
358+
return {
359+
port,
360+
/**
361+
* @param {string} method
362+
* @param {any} [params]
363+
* @returns {Promise<any>}
364+
*/
365+
send(method, params) {
366+
return new Promise((resolve, reject) => {
367+
const id = msgId++;
368+
pending.set(id, { resolve, reject });
369+
ws.send(JSON.stringify({ id, method, params }));
370+
setTimeout(() => {
371+
if (pending.has(id)) {
372+
pending.delete(id);
373+
reject(new Error(`Inspector call timed out: ${method}`));
374+
}
375+
}, 30_000);
376+
});
377+
},
378+
/**
379+
* @param {string} event
380+
* @param {(params: any) => void} listener
381+
*/
382+
on(event, listener) {
383+
const list = eventListeners.get(event) || [];
384+
list.push(listener);
385+
eventListeners.set(event, list);
386+
},
387+
close() {
388+
ws.close();
389+
},
390+
};
391+
}
392+
278393
/**
279394
* Fetch JSON from a URL. Used to probe the CDP endpoint.
280395
* @param {string} url
@@ -647,6 +762,10 @@ const METRIC_DEFS = [
647762
['frameCount', 'rendering', ''],
648763
['compositeLayers', 'rendering', ''],
649764
['paintCount', 'rendering', ''],
765+
['extHostHeapUsedBefore', 'extHost', 'MB'],
766+
['extHostHeapUsedAfter', 'extHost', 'MB'],
767+
['extHostHeapDelta', 'extHost', 'MB'],
768+
['extHostHeapDeltaPostGC', 'extHost', 'MB'],
650769
];
651770

652771
module.exports = {
@@ -670,4 +789,6 @@ module.exports = {
670789
summarize,
671790
markDuration,
672791
launchVSCode,
792+
getNextExtHostInspectPort,
793+
connectToExtHostInspector,
673794
};

0 commit comments

Comments
 (0)