Skip to content

Commit 2773978

Browse files
committed
update
1 parent 1863d3b commit 2773978

File tree

2 files changed

+106
-7
lines changed

2 files changed

+106
-7
lines changed

scripts/chat-simulation/config.jsonc

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,23 @@
66
// Number of benchmark iterations per scenario
77
"runsPerScenario": 5,
88

9-
// Fraction above baseline that triggers a regression (0.2 = 20%)
10-
"regressionThreshold": 0.2
9+
// Default fraction above baseline that triggers a regression (0.2 = 20%).
10+
// Per-metric overrides below take precedence when set.
11+
"regressionThreshold": 0.2,
12+
13+
// Per-metric regression thresholds.
14+
// - A plain number (0-1) is a fraction, e.g. 0.2 = 20% above baseline.
15+
// - A string ending in the metric's unit (e.g. "100ms") is an absolute delta.
16+
// Metrics not listed here use regressionThreshold above.
17+
"metricThresholds": {
18+
"timeToFirstToken": "100ms",
19+
"timeToComplete": 0.2,
20+
"layoutCount": 0.2,
21+
"recalcStyleCount": 0.2,
22+
"forcedReflowCount": 0.2,
23+
"longTaskCount": 0.2,
24+
"longAnimationFrameCount": 0.2
25+
}
1126
},
1227
"memLeaks": {
1328
// Number of open→work→reset cycles

scripts/chat-simulation/test-chat-perf-regression.js

Lines changed: 89 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ function parseArgs() {
5555
baselineBuild: CONFIG.baselineBuild ?? '1.115.0',
5656
saveBaseline: false,
5757
threshold: CONFIG.regressionThreshold ?? 0.2,
58+
/** @type {Record<string, number | string>} */
59+
metricThresholds: CONFIG.metricThresholds ?? {},
5860
/** @type {string | undefined} */
5961
resume: undefined,
6062
};
@@ -104,6 +106,71 @@ function parseArgs() {
104106
return opts;
105107
}
106108

109+
/**
110+
* @typedef {{ type: 'fraction', value: number } | { type: 'absolute', value: number }} MetricThreshold
111+
*/
112+
113+
/**
 * Parse a metric threshold value from config.
 * - A number is treated as a fraction (e.g. 0.2 = 20%).
 * - A string like "100ms" or "5" is treated as an absolute delta. Only the
 *   leading numeric portion is used; any trailing unit suffix is ignored.
 *
 * A NaN threshold is rejected rather than returned, because every `>`
 * comparison against NaN is false and the metric would silently never be
 * flagged.
 * @param {number | string} raw
 * @returns {MetricThreshold}
 * @throws {Error} If `raw` is neither a number nor a string, or does not yield a number.
 */
function parseMetricThreshold(raw) {
	if (typeof raw === 'number') {
		if (Number.isNaN(raw)) {
			throw new Error(`Invalid metric threshold: ${String(raw)}`);
		}
		return { type: 'fraction', value: raw };
	}
	// Anything other than a string (null, boolean, array, object) is a config
	// mistake; do not let parseFloat coerce it into a number.
	if (typeof raw !== 'string') {
		throw new Error(`Invalid metric threshold: ${String(raw)}`);
	}
	// parseFloat reads the leading number and ignores a unit suffix (ms, MB, etc.)
	const num = Number.parseFloat(raw);
	if (Number.isNaN(num)) {
		throw new Error(`Invalid metric threshold: ${raw}`);
	}
	return { type: 'absolute', value: num };
}
131+
132+
/**
 * Get the regression threshold for a specific metric.
 * Uses the per-metric override from `opts.metricThresholds` if one is set for
 * `metric`, otherwise falls back to the global `opts.threshold` as a fraction.
 * A missing `metricThresholds` map is treated as "no overrides".
 * @param {ReturnType<typeof parseArgs>} opts
 * @param {string} metric
 * @returns {MetricThreshold}
 */
function getMetricThreshold(opts, metric) {
	const overrides = opts.metricThresholds;
	// Own-property check so inherited Object.prototype members (e.g. "toString",
	// "constructor") are never mistaken for a configured override.
	if (overrides && Object.prototype.hasOwnProperty.call(overrides, metric)) {
		const raw = overrides[metric];
		if (raw !== undefined) {
			return parseMetricThreshold(raw);
		}
	}
	return { type: 'fraction', value: opts.threshold };
}
146+
147+
/**
 * Decide whether an observed change is larger than the allowed threshold.
 * Absolute thresholds are compared against the absolute delta; fraction
 * thresholds are compared against the fractional change. The comparison is
 * strict, so a value exactly equal to the threshold does not exceed it.
 * @param {MetricThreshold} threshold
 * @param {number} change - fractional change (e.g. 0.5 = 50% increase)
 * @param {number} absoluteDelta - absolute difference (cur.median - bas.median)
 * @returns {boolean}
 */
function exceedsThreshold(threshold, change, absoluteDelta) {
	const observed = threshold.type === 'absolute' ? absoluteDelta : change;
	return observed > threshold.value;
}
160+
161+
/**
 * Format a threshold for display.
 * Absolute thresholds render as the value followed by the unit (e.g. "100ms");
 * fraction thresholds render as a whole-number percentage (e.g. "20%").
 * @param {MetricThreshold} threshold
 * @param {string} [unit] - unit suffix for absolute thresholds; omitted or null renders no suffix
 * @returns {string}
 */
function formatThreshold(threshold, unit) {
	if (threshold.type === 'absolute') {
		// A missing unit must render as an empty suffix, not the text "undefined".
		const suffix = unit ?? '';
		return `${threshold.value}${suffix}`;
	}
	return `${(threshold.value * 100).toFixed(0)}%`;
}
173+
107174
// -- Metrics -----------------------------------------------------------------
108175

109176
/**
@@ -681,7 +748,7 @@ function formatCompareLink(base, test) {
681748
*
682749
* @param {Record<string, any>} jsonReport
683750
* @param {Record<string, any> | null} baseline
684-
* @param {{ threshold: number, runs: number, baselineBuild?: string, build?: string }} opts
751+
* @param {{ threshold: number, metricThresholds: Record<string, number | string>, runs: number, baselineBuild?: string, build?: string }} opts
685752
*/
686753
function generateCISummary(jsonReport, baseline, opts) {
687754
const baseLabel = opts.baselineBuild || 'baseline';
@@ -738,16 +805,18 @@ function generateCISummary(jsonReport, baseline, opts) {
738805
const ttest = welchTTest(basRaw, curRaw);
739806
const pStr = ttest ? `${ttest.pValue}` : 'n/a';
740807

808+
const metricThreshold = getMetricThreshold(opts, metric);
809+
const absoluteDelta = cur.median - bas.median;
741810
let verdict = '';
742811
if (isRegressionMetric) {
743-
if (change > opts.threshold) {
812+
if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
744813
if (!ttest || ttest.significant) {
745814
verdict = 'REGRESSION';
746815
totalRegressions++;
747816
} else {
748817
verdict = 'noise';
749818
}
750-
} else if (change < -opts.threshold && ttest?.significant) {
819+
} else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) {
751820
verdict = 'improved';
752821
totalImprovements++;
753822
} else {
@@ -784,7 +853,19 @@ function generateCISummary(jsonReport, baseline, opts) {
784853
lines.push(`| **Diff** | ${compareLink} |`);
785854
}
786855
lines.push(`| **Runs per scenario** | ${opts.runs} |`);
787-
lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
856+
const overrides = Object.entries(opts.metricThresholds || {}).filter(([, v]) => {
857+
const parsed = parseMetricThreshold(v);
858+
return parsed.type !== 'fraction' || parsed.value !== opts.threshold;
859+
});
860+
if (overrides.length > 0) {
861+
const overrideStr = overrides.map(([k, v]) => {
862+
const parsed = parseMetricThreshold(v);
863+
return `${k}: ${parsed.type === 'absolute' ? `${parsed.value}${k.includes('Ms') || k.includes('Time') || k.includes('time') ? 'ms' : ''}` : `${(parsed.value * 100).toFixed(0)}%`}`;
864+
}).join(', ');
865+
lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`);
866+
} else {
867+
lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
868+
}
788869
lines.push(`| **Scenarios** | ${scenarios.length} |`);
789870
lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`);
790871
lines.push('');
@@ -1324,8 +1405,10 @@ async function printComparison(jsonReport, opts) {
13241405
const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
13251406
const ttest = welchTTest(basRaw, curRaw);
13261407

1408+
const metricThreshold = getMetricThreshold(opts, metric);
1409+
const absoluteDelta = cur.median - bas.median;
13271410
let flag = '';
1328-
if (change > opts.threshold) {
1411+
if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
13291412
if (!ttest) {
13301413
flag = ' ← possible regression (n too small for significance test)';
13311414
inconclusiveFound = true;
@@ -1405,6 +1488,7 @@ async function printComparison(jsonReport, opts) {
14051488
: null;
14061489
const summary = generateCISummary(jsonReport, ciBaseline, {
14071490
threshold: opts.threshold,
1491+
metricThresholds: opts.metricThresholds,
14081492
runs: jsonReport.runsPerScenario || opts.runs,
14091493
baselineBuild: ciBaseline?.baselineBuildVersion || opts.baselineBuild,
14101494
build: opts.build,

0 commit comments

Comments
 (0)