microsoft
diff --git a/‎.github/workflows/chat-perf.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/chat-perf.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎scripts/chat-simulation/common/mock-llm-server.js‎
Lines changed: 36 additions & 15 deletions b/‎scripts/chat-simulation/common/mock-llm-server.js‎
Lines changed: 36 additions & 15 deletions
diff --git a/‎scripts/chat-simulation/common/perf-scenarios.js‎
Lines changed: 107 additions & 0 deletions b/‎scripts/chat-simulation/common/perf-scenarios.js‎
Lines changed: 107 additions & 0 deletions
diff --git a/‎scripts/chat-simulation/common/utils.js‎
Lines changed: 122 additions & 1 deletion b/‎scripts/chat-simulation/common/utils.js‎
Lines changed: 122 additions & 1 deletion
@@ -5,8 +5,8 @@ on:
     paths:
       - '.github/workflows/chat-perf.yml'
   schedule:
-    # Every Friday at 12:00 AM PT (07:00 UTC)
-    - cron: '0 7 * * 5'
+    # Nightly at 07:00 UTC (12:00 AM PDT; 11:00 PM PST the previous evening)
+    - cron: '0 7 * * *'
   workflow_dispatch:
     inputs:
       baseline_commit:
@@ -49,7 +49,7 @@ env:
 
 jobs:
   chat-perf:
-    name: Chat Perf – ${{ inputs.baseline_commit || 'config default' }} vs ${{ inputs.test_commit || github.sha }}
+    name: Chat Perf
     runs-on: ubuntu-latest
     timeout-minutes: 120
     steps:
 
@@ -445,7 +445,7 @@ function handleRequest(req, res) {
 						tokenizer: 'o200k_base',
 						limits: {
 							max_prompt_tokens: 128000,
-							max_output_tokens: 16384,
+							max_output_tokens: 131072,
 							max_context_window_tokens: 128000,
 						},
 						supports: {
@@ -472,7 +472,7 @@ function handleRequest(req, res) {
 						tokenizer: 'o200k_base',
 						limits: {
 							max_prompt_tokens: 128000,
-							max_output_tokens: 16384,
+							max_output_tokens: 131072,
 							max_context_window_tokens: 128000,
 						},
 						supports: {
@@ -508,7 +508,7 @@ function handleRequest(req, res) {
 				type: 'chat',
 				family: 'gpt-4o',
 				tokenizer: 'o200k_base',
-				limits: { max_prompt_tokens: 128000, max_output_tokens: 16384, max_context_window_tokens: 128000 },
+				limits: { max_prompt_tokens: 128000, max_output_tokens: 131072, max_context_window_tokens: 128000 },
 				supports: { streaming: true, tool_calls: true, parallel_tool_calls: true, vision: false },
 			},
 		});
@@ -599,20 +599,36 @@ const serverEvents = new EventEmitter();
 const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
 
 /**
- * Count the number of model turns already completed in the conversation.
- * A model turn is one of:
- *   - An assistant message with tool_calls (tool-calls turn)
- *   - An assistant message with content but no tool_calls (content/thinking turn)
- * The first assistant message after each user message counts as a new model
- * turn. User turns in the scenario are detected by counting user messages
- * beyond the initial one.
+ * Count the number of model turns already completed for the CURRENT scenario.
+ * Only counts assistant messages that appear after the last user message
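+ * containing a [scenario:X] tag. This prevents assistant messages from
+ * previous scenarios (in the same chat session) from inflating the count.
+ * If no user message carries a scenario tag, every assistant message in
+ * `messages` is counted.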
+ *
  * @param {any[]} messages
  * @returns {number}
  */
 function countCompletedModelTurns(messages) {
+	// Find the index of the last user message with a scenario tag
+	let scenarioMsgIdx = -1;
+	for (let i = messages.length - 1; i >= 0; i--) {
+		const msg = messages[i];
+		if (msg.role !== 'user') { continue; }
+		const content = typeof msg.content === 'string'
+			? msg.content
+			: Array.isArray(msg.content)
+				? msg.content.map((/** @type {any} */ c) => c.text || '').join('')
+				: '';
+		if (/\[scenario:[^\]]+\]/.test(content)) {
+			scenarioMsgIdx = i;
+			break;
+		}
+	}
+
+	// Count assistant messages after the scenario tag message
 	let turns = 0;
-	for (const msg of messages) {
-		if (msg.role === 'assistant') {
+	const startIdx = scenarioMsgIdx >= 0 ? scenarioMsgIdx + 1 : 0;
+	for (let i = startIdx; i < messages.length; i++) {
+		if (messages[i].role === 'assistant') {
 			turns++;
 		}
 	}
@@ -680,9 +696,14 @@ async function handleChatCompletions(body, res) {
 			console.log(`[mock-llm]   ${ts} → ${requestToolNames.length} tools available: ${requestToolNames.join(', ')}`);
 		}
 
-		// Search all user messages for the scenario tag (not just the last one,
-		// since follow-up user messages in multi-turn scenarios won't have it).
-		for (const msg of messages) {
+		// Search user messages in reverse order (newest first) for the scenario
+		// tag. This ensures the most recent message's tag takes precedence when
+		// multiple messages with different tags exist in the same conversation
+		// (e.g. in the leak checker which sends many scenarios in one session).
+		// Follow-up user messages in multi-turn scenarios won't have a tag, so
+		// searching backwards still finds the correct tag from the initial message.
+		for (let mi = messages.length - 1; mi >= 0; mi--) {
+			const msg = messages[mi];
 			if (msg.role !== 'user') { continue; }
 			const content = typeof msg.content === 'string'
 				? msg.content
 
@@ -357,6 +357,113 @@ const TOOL_CALL_SCENARIOS = {
 			],
 		};
 	})()),
+
+	// Terminal tool: run commands, read output, run more commands.
+	// Simulates an agent installing dependencies, running tests, and
+	// diagnosing failures — a common agentic workflow.
+	'tool-terminal': /** @type {import('./mock-llm-server').MultiTurnScenario} */ ({
+		type: 'multi-turn',
+		turns: [
+			// Round 1: run the initial command (simulated dependency install)
+			{
+				kind: 'tool-calls',
+				toolCalls: [
+					{
+						toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i,
+						arguments: {
+							command: 'echo "Installing dependencies..." && echo "added 1631 packages in 6m"',
+							explanation: 'Install project dependencies',
+							goal: 'Install dependencies',
+							mode: 'sync',
+							timeout: 30000,
+						},
+					},
+				],
+			},
+			// Round 2: run test command
+			{
+				kind: 'tool-calls',
+				toolCalls: [
+					{
+						toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i,
+						arguments: {
+							command: 'echo "Running unit tests..." && echo "  42 passing (3s)" && echo "  2 failing" && echo "" && echo "  1) ChatService should dispose listeners" && echo "     AssertionError: expected 0 to equal 1" && echo "  2) ChatModel should clear on new session" && echo "     TypeError: Cannot read property dispose of undefined"',
+							explanation: 'Run the unit test suite to check for failures',
+							goal: 'Run tests',
+							mode: 'sync',
+							timeout: 60000,
+						},
+					},
+				],
+			},
+			// Round 3: read the failing test file for context
+			{
+				kind: 'tool-calls',
+				toolCalls: [
+					{
+						toolNamePattern: /read.?file/i,
+						arguments: { filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'), startLine: 1, endLine: 50 },
+					},
+				],
+			},
+			// Round 4: fix the issue with an edit
+			{
+				kind: 'tool-calls',
+				toolCalls: [
+					{
+						toolNamePattern: /replace.?string|apply.?patch|insert.?edit/i,
+						arguments: {
+							filePath: path.join(FIXTURES_DIR, 'lifecycle.ts'),
+							oldString: '// perf-benchmark-marker',
+							newString: '// perf-benchmark-marker (fixed)',
+							explanation: 'Fix the dispose call in the test',
+						},
+					},
+				],
+			},
+			// Round 5: re-run tests to confirm
+			{
+				kind: 'tool-calls',
+				toolCalls: [
+					{
+						toolNamePattern: /run.?in.?terminal|execute.?command|terminal/i,
+						arguments: {
+							command: 'echo "Running unit tests..." && echo "  44 passing (3s)" && echo "  0 failing"',
+							explanation: 'Re-run tests to verify the fix',
+							goal: 'Verify fix',
+							mode: 'sync',
+							timeout: 60000,
+						},
+					},
+				],
+			},
+			// Round 6: final summary
+			{
+				kind: 'content',
+				chunks: new ScenarioBuilder()
+					.wait(20, '## Test Failures Fixed\n\n')
+					.stream([
+						'I found and fixed 2 test failures:\n\n',
+						'### Root Cause\n',
+						'The `ChatService` was not properly disposing event listeners when a session was cleared. ',
+						'The `dispose()` method was missing a call to `this._store.dispose()`.\n\n',
+						'### Changes Made\n',
+						'Updated `lifecycle.ts` to properly chain disposal:\n\n',
+						'```typescript\n',
+						'override dispose(): void {\n',
+						'  this._store.dispose();\n',
+						'  super.dispose();\n',
+						'}\n',
+						'```\n\n',
+						'### Test Results\n',
+						'- **Before**: 42 passing, 2 failing\n',
+						'- **After**: 44 passing, 0 failing\n\n',
+						'All tests pass now. The fix ensures listeners are cleaned up during session transitions.\n',
+					], 15)
+					.build(),
+			},
+		],
+	}),
 };
 
 // -- Multi-turn user conversation scenarios -----------------------------------
 
@@ -193,7 +193,7 @@ function buildEnv(mockServer, { isDevBuild = true } = {}) {
  * @param {string} logsDir
  * @returns {string[]}
  */
-function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true } = {}) {
+function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true, extHostInspectPort = 0 } = {}) {
 	const args = [
 		ROOT,
 		'--skip-release-notes',
@@ -213,6 +213,13 @@ function buildArgs(userDataDir, extDir, logsDir, { isDevBuild = true } = {}) {
 	if (process.platform !== 'darwin') {
 		args.push('--disable-gpu');
 	}
+	if (process.env.CI && process.platform === 'linux') {
+		args.push('--no-sandbox');
+	}
+	// Enable extension host inspector for profiling/heap snapshots
+	if (extHostInspectPort > 0) {
+		args.push(`--inspect-extensions=${extHostInspectPort}`);
+	}
 	return args;
 }
 
@@ -228,6 +235,8 @@ function writeSettings(userDataDir, mockServer) {
 		'github.copilot.advanced.debug.overrideProxyUrl': mockServer.url,
 		'github.copilot.advanced.debug.overrideCapiUrl': mockServer.url,
 		'chat.allowAnonymousAccess': true,
+		// Start new chat sessions in agent mode so tools are available.
+		'chat.newSession.defaultMode': 'agent',
 		// Disable MCP servers — they start async and add unpredictable
 		// delay that pollutes perf measurements.
 		'chat.mcp.discovery.enabled': false,
@@ -275,6 +284,112 @@ function prepareRunDir(runId, mockServer) {
 
 // -- VS Code launch via CDP --------------------------------------------------
 
+// -- Extension host inspector ------------------------------------------------
+
+/**
+ * Next extension-host inspector port to hand out. Starts at 29222 and is
+ * advanced by getNextExtHostInspectPort() on every call.
+ * @type {number}
+ */
+let nextExtHostPort = 29222;
+
+/**
+ * Return the current value of the module-level port counter, then increment
+ * it, so successive calls in this process yield 29222, 29223, ...
+ * NOTE(review): nothing here checks that the returned port is actually free;
+ * confirm callers tolerate a bind failure.
+ * @returns {number}
+ */
+function getNextExtHostInspectPort() {
+	return nextExtHostPort++;
+}
+
+/**
+ * Connect to the extension host's Node inspector via WebSocket.
+ * The extension host must be started with `--inspect-extensions=<port>`.
+ *
+ * @param {number} port
+ * @param {{ verbose?: boolean, timeoutMs?: number }} [opts]
+ * @returns {Promise<{ send: (method: string, params?: any) => Promise<any>, on: (event: string, listener: (params: any) => void) => void, close: () => void, port: number }>}
+ */
+async function connectToExtHostInspector(port, opts = {}) {
+	const { verbose = false, timeoutMs = 30_000 } = opts;
+
+	// Wait for the inspector endpoint to be available
+	const deadline = Date.now() + timeoutMs;
+	/** @type {any} */
+	let wsUrl;
+	while (Date.now() < deadline) {
+		try {
+			const targets = await getJson(`http://127.0.0.1:${port}/json`);
+			if (targets.length > 0 && targets[0].webSocketDebuggerUrl) {
+				wsUrl = targets[0].webSocketDebuggerUrl;
+				break;
+			}
+		} catch { }
+		await new Promise(r => setTimeout(r, 500));
+	}
+	if (!wsUrl) {
+		throw new Error(`Timed out waiting for extension host inspector on port ${port}`);
+	}
+
+	if (verbose) {
+		console.log(`  [ext-host] Connected to inspector: ${wsUrl}`);
+	}
+
+	const WebSocket = require('ws');
+	const ws = new WebSocket(wsUrl);
+	await new Promise((resolve, reject) => {
+		ws.once('open', resolve);
+		ws.once('error', reject);
+	});
+
+	let msgId = 1;
+	/** @type {Map<number, { resolve: (v: any) => void, reject: (e: Error) => void }>} */
+	const pending = new Map();
+	/** @type {Map<string, ((params: any) => void)[]>} */
+	const eventListeners = new Map();
+
+	ws.on('message', (/** @type {Buffer} */ data) => {
+		const msg = JSON.parse(data.toString());
+		if (msg.id !== undefined) {
+			const p = pending.get(msg.id);
+			if (p) {
+				pending.delete(msg.id);
+				if (msg.error) { p.reject(new Error(msg.error.message)); }
+				else { p.resolve(msg.result); }
+			}
+		} else if (msg.method) {
+			const listeners = eventListeners.get(msg.method) || [];
+			for (const listener of listeners) { listener(msg.params); }
+		}
+	});
+
+	return {
+		port,
+		/**
+		 * @param {string} method
+		 * @param {any} [params]
+		 * @returns {Promise<any>}
+		 */
+		send(method, params) {
+			return new Promise((resolve, reject) => {
+				const id = msgId++;
+				pending.set(id, { resolve, reject });
+				ws.send(JSON.stringify({ id, method, params }));
+				setTimeout(() => {
+					if (pending.has(id)) {
+						pending.delete(id);
+						reject(new Error(`Inspector call timed out: ${method}`));
+					}
+				}, 30_000);
+			});
+		},
+		/**
+		 * @param {string} event
+		 * @param {(params: any) => void} listener
+		 */
+		on(event, listener) {
+			const list = eventListeners.get(event) || [];
+			list.push(listener);
+			eventListeners.set(event, list);
+		},
+		close() {
+			ws.close();
+		},
+	};
+}
+
 /**
  * Fetch JSON from a URL. Used to probe the CDP endpoint.
  * @param {string} url
@@ -647,6 +762,10 @@ const METRIC_DEFS = [
 	['frameCount', 'rendering', ''],
 	['compositeLayers', 'rendering', ''],
 	['paintCount', 'rendering', ''],
+	['extHostHeapUsedBefore', 'extHost', 'MB'],
+	['extHostHeapUsedAfter', 'extHost', 'MB'],
+	['extHostHeapDelta', 'extHost', 'MB'],
+	['extHostHeapDeltaPostGC', 'extHost', 'MB'],
 ];
 
 module.exports = {
@@ -670,4 +789,6 @@ module.exports = {
 	summarize,
 	markDuration,
 	launchVSCode,
+	getNextExtHostInspectPort,
+	connectToExtHostInspector,
 };