@@ -712,7 +712,69 @@ function generateCISummary(jsonReport, baseline, opts) {
712712 const lines = [ ] ;
713713 const scenarios = Object . keys ( jsonReport . scenarios ) ;
714714
715- lines . push ( `# Chat Performance Comparison` ) ;
715+ // -- Collect verdicts per scenario/metric --------------------------------
716+ /** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]> } */
717+ const scenarioVerdicts = new Map ( ) ;
718+ let totalRegressions = 0 ;
719+ let totalImprovements = 0 ;
720+
721+ for ( const scenario of scenarios ) {
722+ const current = jsonReport . scenarios [ scenario ] ;
723+ const base = baseline ?. scenarios ?. [ scenario ] ;
724+ /** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[] } */
725+ const verdicts = [ ] ;
726+
727+ if ( base ) {
728+ for ( const [ metric , group , unit ] of allMetrics ) {
729+ const cur = current [ group ] ?. [ metric ] ;
730+ const bas = base [ group ] ?. [ metric ] ;
731+ if ( ! cur || ! bas || bas . median === null || bas . median === undefined ) { continue ; }
732+
733+ const change = bas . median !== 0 ? ( cur . median - bas . median ) / bas . median : 0 ;
734+ const isRegressionMetric = regressionMetricNames . has ( metric ) ;
735+
736+ const curRaw = ( current . rawRuns || [ ] ) . map ( ( /** @type {any } */ r ) => r [ metric ] ) . filter ( ( /** @type {any } */ v ) => v >= 0 ) ;
737+ const basRaw = ( base . rawRuns || [ ] ) . map ( ( /** @type {any } */ r ) => r [ metric ] ) . filter ( ( /** @type {any } */ v ) => v >= 0 ) ;
738+ const ttest = welchTTest ( basRaw , curRaw ) ;
739+ const pStr = ttest ? `${ ttest . pValue } ` : 'n/a' ;
740+
741+ let verdict = '' ;
742+ if ( isRegressionMetric ) {
743+ if ( change > opts . threshold ) {
744+ if ( ! ttest || ttest . significant ) {
745+ verdict = 'REGRESSION' ;
746+ totalRegressions ++ ;
747+ } else {
748+ verdict = 'noise' ;
749+ }
750+ } else if ( change < - opts . threshold && ttest ?. significant ) {
751+ verdict = 'improved' ;
752+ totalImprovements ++ ;
753+ } else {
754+ verdict = 'ok' ;
755+ }
756+ } else {
757+ verdict = 'info' ;
758+ }
759+
760+ const basStr = `${ bas . median } ${ unit } \xb1${ bas . stddev } ${ unit } ` ;
761+ const curStr = `${ cur . median } ${ unit } \xb1${ cur . stddev } ${ unit } ` ;
762+ verdicts . push ( { metric, verdict, change, pValue : pStr , basStr, curStr } ) ;
763+ }
764+ }
765+ scenarioVerdicts . set ( scenario , verdicts ) ;
766+ }
767+
768+ // -- Header with verdict up front ----------------------------------------
769+ const hasRegressions = totalRegressions > 0 ;
770+ const verdictIcon = hasRegressions ? '\u274C' : '\u2705' ;
771+ const verdictText = hasRegressions
772+ ? `${ totalRegressions } regression(s) detected`
773+ : totalImprovements > 0
774+ ? `No regressions \u2014 ${ totalImprovements } improvement(s)`
775+ : 'No significant changes' ;
776+
777+ lines . push ( `# ${ verdictIcon } Chat Performance: ${ verdictText } ` ) ;
716778 lines . push ( '' ) ;
717779 lines . push ( `| | |` ) ;
718780 lines . push ( `|---|---|` ) ;
@@ -727,23 +789,85 @@ function generateCISummary(jsonReport, baseline, opts) {
727789 lines . push ( `| **Platform** | ${ process . platform } / ${ process . arch } |` ) ;
728790 lines . push ( '' ) ;
729791
730- // Overall status
731- let totalRegressions = 0 ;
732- let totalImprovements = 0 ;
792+ // -- At-a-glance overview table: one row per scenario --------------------
793+ lines . push ( `## Overview` ) ;
794+ lines . push ( '' ) ;
795+ lines . push ( '| Scenario | TTFT | Complete | Layouts | Styles | LoAF | Verdict |' ) ;
796+ lines . push ( '|----------|-----:|---------:|--------:|-------:|-----:|:-------:|' ) ;
733797
734- // Per-scenario tables
735798 for ( const scenario of scenarios ) {
736- const current = jsonReport . scenarios [ scenario ] ;
799+ const verdicts = scenarioVerdicts . get ( scenario ) || [ ] ;
800+ const get = ( /** @type {string } */ m ) => verdicts . find ( v => v . metric === m ) ;
801+
802+ const ttft = get ( 'timeToFirstToken' ) ;
803+ const complete = get ( 'timeToComplete' ) ;
804+ const layouts = get ( 'layoutCount' ) ;
805+ const styles = get ( 'recalcStyleCount' ) ;
806+ const loaf = get ( 'longAnimationFrameCount' ) ;
807+
808+ const fmtCell = ( /** @type {{ change: number, verdict: string } | undefined } */ v ) => {
809+ if ( ! v ) { return '\u2014' ; }
810+ const pct = `${ v . change > 0 ? '+' : '' } ${ ( v . change * 100 ) . toFixed ( 0 ) } %` ;
811+ return pct ;
812+ } ;
813+
814+ const fmtVerdict = ( /** @type {{ verdict: string, change: number }[] } */ vs ) => {
815+ const hasRegression = vs . some ( v => v . verdict === 'REGRESSION' ) ;
816+ const hasImproved = vs . some ( v => v . verdict === 'improved' ) ;
817+ if ( hasRegression ) { return '\u274C Regressed' ; }
818+ if ( hasImproved ) { return '\u2B06\uFE0F Improved' ; }
819+ return '\u2705 OK' ;
820+ } ;
821+
822+ const keyVerdicts = [ ttft , complete , layouts , styles , loaf ] . filter ( Boolean ) ;
823+ const rowVerdict = fmtVerdict ( /** @type {any[] } */ ( keyVerdicts ) ) ;
824+
825+ lines . push ( `| ${ scenario } | ${ fmtCell ( ttft ) } | ${ fmtCell ( complete ) } | ${ fmtCell ( layouts ) } | ${ fmtCell ( styles ) } | ${ fmtCell ( loaf ) } | ${ rowVerdict } |` ) ;
826+ }
827+ lines . push ( '' ) ;
828+
829+ // -- Regressions & improvements detail section ---------------------------
830+ const hasNotable = [ ...scenarioVerdicts . values ( ) ] . some ( vs => vs . some ( v => v . verdict === 'REGRESSION' || v . verdict === 'improved' ) ) ;
831+ if ( hasNotable ) {
832+ lines . push ( '## Regressions & Improvements' ) ;
833+ lines . push ( '' ) ;
834+ lines . push ( 'Only metrics that regressed or improved significantly are shown below.' ) ;
835+ lines . push ( '' ) ;
836+
837+ for ( const scenario of scenarios ) {
838+ const verdicts = scenarioVerdicts . get ( scenario ) || [ ] ;
839+ const notable = verdicts . filter ( v => v . verdict === 'REGRESSION' || v . verdict === 'improved' ) ;
840+ if ( notable . length === 0 ) { continue ; }
841+
842+ const icon = notable . some ( v => v . verdict === 'REGRESSION' ) ? '\u274C' : '\u2B06\uFE0F' ;
843+ lines . push ( `### ${ icon } ${ scenario } ` ) ;
844+ lines . push ( '' ) ;
845+ lines . push ( '| Metric | Baseline | Test | Change | p-value | Verdict |' ) ;
846+ lines . push ( '|--------|----------|------|--------|---------|---------|' ) ;
847+ for ( const v of notable ) {
848+ const pct = `${ v . change > 0 ? '+' : '' } ${ ( v . change * 100 ) . toFixed ( 1 ) } %` ;
849+ const verdictIcon = v . verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F' ;
850+ lines . push ( `| ${ v . metric } | ${ v . basStr } | ${ v . curStr } | ${ pct } | ${ v . pValue } | ${ verdictIcon } ${ v . verdict } |` ) ;
851+ }
852+ lines . push ( '' ) ;
853+ }
854+ }
855+
856+ // -- Full metric tables in collapsible section ---------------------------
857+ lines . push ( '<details><summary>Full metric details per scenario</summary>' ) ;
858+ lines . push ( '' ) ;
859+
860+ for ( const scenario of scenarios ) {
861+ const verdicts = scenarioVerdicts . get ( scenario ) || [ ] ;
737862 const base = baseline ?. scenarios ?. [ scenario ] ;
738863
739- lines . push ( `## ${ scenario } ` ) ;
864+ lines . push ( `### ${ scenario } ` ) ;
740865 lines . push ( '' ) ;
741866
742867 if ( ! base ) {
868+ const current = jsonReport . scenarios [ scenario ] ;
743869 lines . push ( '> No baseline data for this scenario.' ) ;
744870 lines . push ( '' ) ;
745-
746- // Show absolute values
747871 lines . push ( '| Metric | Value | StdDev | CV | n |' ) ;
748872 lines . push ( '|--------|------:|-------:|---:|--:|' ) ;
749873 for ( const [ metric , group , unit ] of allMetrics ) {
@@ -758,63 +882,21 @@ function generateCISummary(jsonReport, baseline, opts) {
758882 lines . push ( `| Metric | Baseline | Test | Change | p-value | Verdict |` ) ;
759883 lines . push ( `|--------|----------|------|--------|---------|---------|` ) ;
760884
761- for ( const [ metric , group , unit ] of allMetrics ) {
762- const cur = current [ group ] ?. [ metric ] ;
763- const bas = base [ group ] ?. [ metric ] ;
764- if ( ! cur || ! bas || bas . median === null || bas . median === undefined ) { continue ; }
765-
766- const change = bas . median !== 0 ? ( cur . median - bas . median ) / bas . median : 0 ;
767- const pct = `${ change > 0 ? '+' : '' } ${ ( change * 100 ) . toFixed ( 1 ) } %` ;
768- const isRegressionMetric = regressionMetricNames . has ( metric ) ;
769-
770- // t-test
771- const curRaw = ( current . rawRuns || [ ] ) . map ( ( /** @type {any } */ r ) => r [ metric ] ) . filter ( ( /** @type {any } */ v ) => v >= 0 ) ;
772- const basRaw = ( base . rawRuns || [ ] ) . map ( ( /** @type {any } */ r ) => r [ metric ] ) . filter ( ( /** @type {any } */ v ) => v >= 0 ) ;
773- const ttest = welchTTest ( basRaw , curRaw ) ;
774- const pStr = ttest ? `${ ttest . pValue } ` : 'n/a' ;
775-
776- let verdict = '' ;
777- if ( isRegressionMetric ) {
778- if ( change > opts . threshold ) {
779- if ( ! ttest ) {
780- verdict = 'REGRESSION' ;
781- totalRegressions ++ ;
782- } else if ( ttest . significant ) {
783- verdict = 'REGRESSION' ;
784- totalRegressions ++ ;
785- } else {
786- verdict = 'noise' ;
787- }
788- } else if ( change < - opts . threshold && ttest ?. significant ) {
789- verdict = 'improved' ;
790- totalImprovements ++ ;
791- } else {
792- verdict = 'ok' ;
793- }
794- } else {
795- verdict = 'info' ;
796- }
797-
798- const basStr = `${ bas . median } ${ unit } \xb1${ bas . stddev } ${ unit } ` ;
799- const curStr = `${ cur . median } ${ unit } \xb1${ cur . stddev } ${ unit } ` ;
800- lines . push ( `| ${ metric } | ${ basStr } | ${ curStr } | ${ pct } | ${ pStr } | ${ verdict } |` ) ;
885+ for ( const v of verdicts ) {
886+ const pct = `${ v . change > 0 ? '+' : '' } ${ ( v . change * 100 ) . toFixed ( 1 ) } %` ;
887+ let verdictDisplay = v . verdict ;
888+ if ( v . verdict === 'REGRESSION' ) { verdictDisplay = '\u274C REGRESSION' ; }
889+ else if ( v . verdict === 'improved' ) { verdictDisplay = '\u2B06\uFE0F improved' ; }
890+ else if ( v . verdict === 'ok' ) { verdictDisplay = '\u2705 ok' ; }
891+ else if ( v . verdict === 'noise' ) { verdictDisplay = '\uD83C\uDF2B\uFE0F noise' ; }
892+ lines . push ( `| ${ v . metric } | ${ v . basStr } | ${ v . curStr } | ${ pct } | ${ v . pValue } | ${ verdictDisplay } |` ) ;
801893 }
802894 lines . push ( '' ) ;
803895 }
804-
805- // Grand summary
806- lines . push ( '## Summary' ) ;
807- lines . push ( '' ) ;
808- if ( totalRegressions > 0 ) {
809- lines . push ( `**${ totalRegressions } regression(s) detected** across ${ scenarios . length } scenario(s).` ) ;
810- } else if ( totalImprovements > 0 ) {
811- lines . push ( `**No regressions.** ${ totalImprovements } improvement(s) detected.` ) ;
812- } else {
813- lines . push ( `**No significant changes** across ${ scenarios . length } scenario(s).` ) ;
814- }
896+ lines . push ( '</details>' ) ;
815897 lines . push ( '' ) ;
816898
817- // Raw data per scenario
899+ // -- Raw run data in collapsible section ---------------------------------
818900 lines . push ( '<details><summary>Raw run data</summary>' ) ;
819901 lines . push ( '' ) ;
820902 for ( const scenario of scenarios ) {
0 commit comments