Skip to content

Commit 7150f9c

Browse files
committed
output improvements
1 parent 543cc19 commit 7150f9c

3 files changed

Lines changed: 180 additions & 65 deletions

File tree

.github/workflows/chat-perf.yml

Lines changed: 30 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -395,9 +395,36 @@ jobs:
395395
echo "" >> "$GITHUB_STEP_SUMMARY"
396396
echo "## Memory Leak Check" >> "$GITHUB_STEP_SUMMARY"
397397
echo "" >> "$GITHUB_STEP_SUMMARY"
398-
echo '```json' >> "$GITHUB_STEP_SUMMARY"
399-
cat leak-results/.chat-simulation-data/chat-simulation-leak-results.json >> "$GITHUB_STEP_SUMMARY"
400-
echo '```' >> "$GITHUB_STEP_SUMMARY"
398+
399+
node -e "
400+
const r = JSON.parse(require('fs').readFileSync('leak-results/.chat-simulation-data/chat-simulation-leak-results.json', 'utf-8'));
401+
const threshold = r.leakThresholdMB || 10;
402+
const leaked = r.totalResidualMB > threshold;
403+
const verdict = leaked ? '❌ **LEAK DETECTED**' : '✅ **No leak detected**';
404+
const lines = [];
405+
lines.push('| | |');
406+
lines.push('|---|---|');
407+
lines.push('| **Verdict** | ' + verdict + ' |');
408+
lines.push('| **Threshold** | ' + threshold + ' MB |');
409+
lines.push('| **Iterations** | ' + (r.iterationCount || r.iterations.length) + ' (+ 1 warmup) |');
410+
lines.push('| **Scenarios per iteration** | ' + (r.scenarioCount || '—') + ' |');
411+
lines.push('');
412+
lines.push('| Phase | Heap (MB) | DOM Nodes |');
413+
lines.push('|-------|----------:|----------:|');
414+
lines.push('| Baseline (post-warmup) | ' + r.baseline.heapMB + ' | ' + r.baseline.domNodes + ' |');
415+
for (let i = 0; i < r.iterations.length; i++) {
416+
const it = r.iterations[i];
417+
const sign = it.deltaHeapMB > 0 ? '+' : '';
418+
const domSign = it.deltaDomNodes > 0 ? '+' : '';
419+
lines.push('| Iteration ' + (i + 1) + ' | ' + it.afterHeapMB + ' (' + sign + it.deltaHeapMB + ') | ' + it.afterDomNodes + ' (' + domSign + it.deltaDomNodes + ') |');
420+
}
421+
lines.push('| **Final** | **' + r.final.heapMB + '** | **' + r.final.domNodes + '** |');
422+
lines.push('');
423+
const sign = r.totalResidualMB > 0 ? '+' : '';
424+
const domSign = r.totalResidualNodes > 0 ? '+' : '';
425+
lines.push('**Total residual growth:** ' + sign + r.totalResidualMB + ' MB heap, ' + domSign + r.totalResidualNodes + ' DOM nodes');
426+
console.log(lines.join('\n'));
427+
" >> "$GITHUB_STEP_SUMMARY"
401428
fi
402429
403430
- name: Zip diagnostic outputs

scripts/chat-simulation/test-chat-mem-leaks.js

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -396,7 +396,13 @@ async function main() {
396396

397397
// Write JSON
398398
const jsonPath = path.join(DATA_DIR, 'chat-simulation-leak-results.json');
399-
fs.writeFileSync(jsonPath, JSON.stringify({ timestamp: new Date().toISOString(), ...result }, null, 2));
399+
fs.writeFileSync(jsonPath, JSON.stringify({
400+
timestamp: new Date().toISOString(),
401+
leakThresholdMB: opts.leakThresholdMB,
402+
iterationCount: opts.iterations,
403+
scenarioCount: getScenarioIds().length,
404+
...result,
405+
}, null, 2));
400406
console.log(`[chat-simulation] Results written to ${jsonPath}`);
401407

402408
const leaked = result.totalResidualMB > opts.leakThresholdMB;

scripts/chat-simulation/test-chat-perf-regression.js

Lines changed: 143 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -712,7 +712,69 @@ function generateCISummary(jsonReport, baseline, opts) {
712712
const lines = [];
713713
const scenarios = Object.keys(jsonReport.scenarios);
714714

715-
lines.push(`# Chat Performance Comparison`);
715+
// -- Collect verdicts per scenario/metric --------------------------------
716+
/** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]>} */
717+
const scenarioVerdicts = new Map();
718+
let totalRegressions = 0;
719+
let totalImprovements = 0;
720+
721+
for (const scenario of scenarios) {
722+
const current = jsonReport.scenarios[scenario];
723+
const base = baseline?.scenarios?.[scenario];
724+
/** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */
725+
const verdicts = [];
726+
727+
if (base) {
728+
for (const [metric, group, unit] of allMetrics) {
729+
const cur = current[group]?.[metric];
730+
const bas = base[group]?.[metric];
731+
if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
732+
733+
const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
734+
const isRegressionMetric = regressionMetricNames.has(metric);
735+
736+
const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
737+
const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
738+
const ttest = welchTTest(basRaw, curRaw);
739+
const pStr = ttest ? `${ttest.pValue}` : 'n/a';
740+
741+
let verdict = '';
742+
if (isRegressionMetric) {
743+
if (change > opts.threshold) {
744+
if (!ttest || ttest.significant) {
745+
verdict = 'REGRESSION';
746+
totalRegressions++;
747+
} else {
748+
verdict = 'noise';
749+
}
750+
} else if (change < -opts.threshold && ttest?.significant) {
751+
verdict = 'improved';
752+
totalImprovements++;
753+
} else {
754+
verdict = 'ok';
755+
}
756+
} else {
757+
verdict = 'info';
758+
}
759+
760+
const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
761+
const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
762+
verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr });
763+
}
764+
}
765+
scenarioVerdicts.set(scenario, verdicts);
766+
}
767+
768+
// -- Header with verdict up front ----------------------------------------
769+
const hasRegressions = totalRegressions > 0;
770+
const verdictIcon = hasRegressions ? '\u274C' : '\u2705';
771+
const verdictText = hasRegressions
772+
? `${totalRegressions} regression(s) detected`
773+
: totalImprovements > 0
774+
? `No regressions \u2014 ${totalImprovements} improvement(s)`
775+
: 'No significant changes';
776+
777+
lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`);
716778
lines.push('');
717779
lines.push(`| | |`);
718780
lines.push(`|---|---|`);
@@ -727,23 +789,85 @@ function generateCISummary(jsonReport, baseline, opts) {
727789
lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`);
728790
lines.push('');
729791

730-
// Overall status
731-
let totalRegressions = 0;
732-
let totalImprovements = 0;
792+
// -- At-a-glance overview table: one row per scenario --------------------
793+
lines.push(`## Overview`);
794+
lines.push('');
795+
lines.push('| Scenario | TTFT | Complete | Layouts | Styles | LoAF | Verdict |');
796+
lines.push('|----------|-----:|---------:|--------:|-------:|-----:|:-------:|');
733797

734-
// Per-scenario tables
735798
for (const scenario of scenarios) {
736-
const current = jsonReport.scenarios[scenario];
799+
const verdicts = scenarioVerdicts.get(scenario) || [];
800+
const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m);
801+
802+
const ttft = get('timeToFirstToken');
803+
const complete = get('timeToComplete');
804+
const layouts = get('layoutCount');
805+
const styles = get('recalcStyleCount');
806+
const loaf = get('longAnimationFrameCount');
807+
808+
const fmtCell = (/** @type {{ change: number, verdict: string } | undefined} */ v) => {
809+
if (!v) { return '\u2014'; }
810+
const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`;
811+
return pct;
812+
};
813+
814+
const fmtVerdict = (/** @type {{ verdict: string, change: number }[]} */ vs) => {
815+
const hasRegression = vs.some(v => v.verdict === 'REGRESSION');
816+
const hasImproved = vs.some(v => v.verdict === 'improved');
817+
if (hasRegression) { return '\u274C Regressed'; }
818+
if (hasImproved) { return '\u2B06\uFE0F Improved'; }
819+
return '\u2705 OK';
820+
};
821+
822+
const keyVerdicts = [ttft, complete, layouts, styles, loaf].filter(Boolean);
823+
const rowVerdict = fmtVerdict(/** @type {any[]} */(keyVerdicts));
824+
825+
lines.push(`| ${scenario} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`);
826+
}
827+
lines.push('');
828+
829+
// -- Regressions & improvements detail section ---------------------------
830+
const hasNotable = [...scenarioVerdicts.values()].some(vs => vs.some(v => v.verdict === 'REGRESSION' || v.verdict === 'improved'));
831+
if (hasNotable) {
832+
lines.push('## Regressions & Improvements');
833+
lines.push('');
834+
lines.push('Only metrics that regressed or improved significantly are shown below.');
835+
lines.push('');
836+
837+
for (const scenario of scenarios) {
838+
const verdicts = scenarioVerdicts.get(scenario) || [];
839+
const notable = verdicts.filter(v => v.verdict === 'REGRESSION' || v.verdict === 'improved');
840+
if (notable.length === 0) { continue; }
841+
842+
const icon = notable.some(v => v.verdict === 'REGRESSION') ? '\u274C' : '\u2B06\uFE0F';
843+
lines.push(`### ${icon} ${scenario}`);
844+
lines.push('');
845+
lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |');
846+
lines.push('|--------|----------|------|--------|---------|---------|');
847+
for (const v of notable) {
848+
const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
849+
const verdictIcon = v.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F';
850+
lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictIcon} ${v.verdict} |`);
851+
}
852+
lines.push('');
853+
}
854+
}
855+
856+
// -- Full metric tables in collapsible section ---------------------------
857+
lines.push('<details><summary>Full metric details per scenario</summary>');
858+
lines.push('');
859+
860+
for (const scenario of scenarios) {
861+
const verdicts = scenarioVerdicts.get(scenario) || [];
737862
const base = baseline?.scenarios?.[scenario];
738863

739-
lines.push(`## ${scenario}`);
864+
lines.push(`### ${scenario}`);
740865
lines.push('');
741866

742867
if (!base) {
868+
const current = jsonReport.scenarios[scenario];
743869
lines.push('> No baseline data for this scenario.');
744870
lines.push('');
745-
746-
// Show absolute values
747871
lines.push('| Metric | Value | StdDev | CV | n |');
748872
lines.push('|--------|------:|-------:|---:|--:|');
749873
for (const [metric, group, unit] of allMetrics) {
@@ -758,63 +882,21 @@ function generateCISummary(jsonReport, baseline, opts) {
758882
lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`);
759883
lines.push(`|--------|----------|------|--------|---------|---------|`);
760884

761-
for (const [metric, group, unit] of allMetrics) {
762-
const cur = current[group]?.[metric];
763-
const bas = base[group]?.[metric];
764-
if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
765-
766-
const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
767-
const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;
768-
const isRegressionMetric = regressionMetricNames.has(metric);
769-
770-
// t-test
771-
const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
772-
const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
773-
const ttest = welchTTest(basRaw, curRaw);
774-
const pStr = ttest ? `${ttest.pValue}` : 'n/a';
775-
776-
let verdict = '';
777-
if (isRegressionMetric) {
778-
if (change > opts.threshold) {
779-
if (!ttest) {
780-
verdict = 'REGRESSION';
781-
totalRegressions++;
782-
} else if (ttest.significant) {
783-
verdict = 'REGRESSION';
784-
totalRegressions++;
785-
} else {
786-
verdict = 'noise';
787-
}
788-
} else if (change < -opts.threshold && ttest?.significant) {
789-
verdict = 'improved';
790-
totalImprovements++;
791-
} else {
792-
verdict = 'ok';
793-
}
794-
} else {
795-
verdict = 'info';
796-
}
797-
798-
const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
799-
const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
800-
lines.push(`| ${metric} | ${basStr} | ${curStr} | ${pct} | ${pStr} | ${verdict} |`);
885+
for (const v of verdicts) {
886+
const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
887+
let verdictDisplay = v.verdict;
888+
if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; }
889+
else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; }
890+
else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; }
891+
else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; }
892+
lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`);
801893
}
802894
lines.push('');
803895
}
804-
805-
// Grand summary
806-
lines.push('## Summary');
807-
lines.push('');
808-
if (totalRegressions > 0) {
809-
lines.push(`**${totalRegressions} regression(s) detected** across ${scenarios.length} scenario(s).`);
810-
} else if (totalImprovements > 0) {
811-
lines.push(`**No regressions.** ${totalImprovements} improvement(s) detected.`);
812-
} else {
813-
lines.push(`**No significant changes** across ${scenarios.length} scenario(s).`);
814-
}
896+
lines.push('</details>');
815897
lines.push('');
816898

817-
// Raw data per scenario
899+
// -- Raw run data in collapsible section ---------------------------------
818900
lines.push('<details><summary>Raw run data</summary>');
819901
lines.push('');
820902
for (const scenario of scenarios) {

0 commit comments

Comments
 (0)