Skip to content

Commit 01c6be5

Browse files
Copilot and pelikhan authored
feat: add StatVar for numerically stable statistics in audit and logs commands (#26479)
* feat: add StatVar for numerically stable statistics in audit and logs commands - Add pkg/stats.StatVar using Welford's online algorithm (mean, variance, stddev, median, min, max, count, sum) - Add MedianTimeBetweenTurns and StdDevTimeBetweenTurns to workflow.LogMetrics - Use StatVar in copilot_events_jsonl.go for TBT computation - Add median/stddev fields to MetricsTrendData in audit_cross_run.go - Use StatVar in buildMetricsTrend() replacing manual accumulation - Use StatVar in health_metrics.go CalculateWorkflowHealth() Agent-Logs-Url: https://github.com/github/gh-aw/sessions/30f4a60c-5387-4216-b4a2-56643a7c1967 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> * docs: address code review feedback - document statistical choices and memory usage Agent-Logs-Url: https://github.com/github/gh-aw/sessions/30f4a60c-5387-4216-b4a2-56643a7c1967 Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> * fix: address code review - remove unused totalDuration, fix duplicate comment, clarify NaN doc - health_metrics.go: remove unused totalDuration variable - audit_cross_run.go: remove duplicate MetricsTrendData comment block - statvar.go: clarify Add() doc to accurately reflect NaN/Min/Max behavior Agent-Logs-Url: https://github.com/github/gh-aw/sessions/b393d657-9b75-407b-8c0e-e9dcd97e4bbe Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com> Co-authored-by: Peli de Halleux <pelikhan@users.noreply.github.com>
1 parent c9aeb7b commit 01c6be5

File tree

6 files changed

+395
-99
lines changed

6 files changed

+395
-99
lines changed

pkg/cli/audit_cross_run.go

Lines changed: 74 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"time"
66

77
"github.com/github/gh-aw/pkg/logger"
8+
"github.com/github/gh-aw/pkg/stats"
89
)
910

1011
var auditCrossRunLog = logger.New("cli:audit_cross_run")
@@ -46,25 +47,42 @@ type CrossRunSummary struct {
4647

4748
// MetricsTrendData contains aggregated cost, token, turn, and duration statistics
4849
// across multiple runs, with spike detection for anomalous runs.
50+
//
51+
// Token counts (MinTokens, MaxTokens, AvgTokens) are stored as int to preserve
52+
// integer semantics consistent with the source data; MedianTokens and StdDevTokens
53+
// use float64 because statistical measures of integer quantities can be fractional.
54+
//
55+
// Duration fields only aggregate runs where timing data was recorded (duration > 0),
56+
// so the duration statistics may cover fewer runs than the cost/token/turn statistics.
57+
// All stddev fields use the sample standard deviation (Bessel's correction).
4958
type MetricsTrendData struct {
50-
TotalCost float64 `json:"total_cost"`
51-
AvgCost float64 `json:"avg_cost"`
52-
MinCost float64 `json:"min_cost"`
53-
MaxCost float64 `json:"max_cost"`
54-
TotalTokens int `json:"total_tokens"`
55-
AvgTokens int `json:"avg_tokens"`
56-
MinTokens int `json:"min_tokens"`
57-
MaxTokens int `json:"max_tokens"`
58-
TotalTurns int `json:"total_turns"`
59-
AvgTurns float64 `json:"avg_turns"`
60-
MaxTurns int `json:"max_turns"`
61-
// Duration statistics (stored as nanoseconds for JSON portability)
62-
AvgDurationNs int64 `json:"avg_duration_ns"`
63-
MinDurationNs int64 `json:"min_duration_ns"`
64-
MaxDurationNs int64 `json:"max_duration_ns"`
65-
CostSpikes []int64 `json:"cost_spikes,omitempty"` // Run IDs with cost > 2x avg
66-
TokenSpikes []int64 `json:"token_spikes,omitempty"` // Run IDs with tokens > 2x avg
67-
RunsWithCost int `json:"runs_with_cost"` // Runs that reported non-zero cost
59+
TotalCost float64 `json:"total_cost"`
60+
AvgCost float64 `json:"avg_cost"`
61+
MedianCost float64 `json:"median_cost"`
62+
StdDevCost float64 `json:"stddev_cost"`
63+
MinCost float64 `json:"min_cost"`
64+
MaxCost float64 `json:"max_cost"`
65+
TotalTokens int `json:"total_tokens"`
66+
AvgTokens int `json:"avg_tokens"`
67+
MedianTokens float64 `json:"median_tokens"` // float64: median of integer counts can be fractional
68+
StdDevTokens float64 `json:"stddev_tokens"` // float64: stddev is always fractional
69+
MinTokens int `json:"min_tokens"`
70+
MaxTokens int `json:"max_tokens"`
71+
TotalTurns int `json:"total_turns"`
72+
AvgTurns float64 `json:"avg_turns"`
73+
MedianTurns float64 `json:"median_turns"`
74+
StdDevTurns float64 `json:"stddev_turns"`
75+
MaxTurns int `json:"max_turns"`
76+
// Duration statistics (stored as nanoseconds for JSON portability).
77+
// Only runs with duration > 0 contribute; runs without timing data are excluded.
78+
AvgDurationNs int64 `json:"avg_duration_ns"`
79+
MedianDurationNs int64 `json:"median_duration_ns"`
80+
StdDevDurationNs int64 `json:"stddev_duration_ns"`
81+
MinDurationNs int64 `json:"min_duration_ns"`
82+
MaxDurationNs int64 `json:"max_duration_ns"`
83+
CostSpikes []int64 `json:"cost_spikes,omitempty"` // Run IDs with cost > 2x avg
84+
TokenSpikes []int64 `json:"token_spikes,omitempty"` // Run IDs with tokens > 2x avg
85+
RunsWithCost int `json:"runs_with_cost"` // Runs that reported non-zero cost
6886
}
6987

7088
// MCPServerCrossRunHealth describes the health of a single MCP server across runs.
@@ -378,71 +396,65 @@ type metricsRawRow struct {
378396
duration time.Duration
379397
}
380398

381-
// buildMetricsTrend computes aggregate metrics (min/max/avg/total, spike detection)
382-
// from a slice of per-run raw metric rows.
399+
// buildMetricsTrend computes aggregate metrics (min/max/avg/median/stddev/total, spike
400+
// detection) from a slice of per-run raw metric rows. Mean and variance are computed
401+
// using Welford's online algorithm via StatVar for numerical stability.
383402
func buildMetricsTrend(rows []metricsRawRow) MetricsTrendData {
384403
auditCrossRunLog.Printf("Building metrics trend from %d rows", len(rows))
385404
if len(rows) == 0 {
386405
return MetricsTrendData{}
387406
}
388407

389-
trend := MetricsTrendData{
390-
MinCost: rows[0].cost,
391-
MaxCost: rows[0].cost,
392-
MinTokens: rows[0].tokens,
393-
MaxTokens: rows[0].tokens,
394-
}
395-
396-
var totalDuration time.Duration
397-
var minDuration, maxDuration time.Duration
408+
var costStats, tokenStats, turnStats, durationStats stats.StatVar
398409

399-
for i, r := range rows {
410+
trend := MetricsTrendData{}
411+
for _, r := range rows {
400412
trend.TotalCost += r.cost
401413
trend.TotalTokens += r.tokens
402414
trend.TotalTurns += r.turns
403415

404416
if r.cost > 0 {
405417
trend.RunsWithCost++
406418
}
407-
if r.cost < trend.MinCost {
408-
trend.MinCost = r.cost
409-
}
410-
if r.cost > trend.MaxCost {
411-
trend.MaxCost = r.cost
412-
}
413-
if r.tokens < trend.MinTokens {
414-
trend.MinTokens = r.tokens
415-
}
416-
if r.tokens > trend.MaxTokens {
417-
trend.MaxTokens = r.tokens
418-
}
419419
if r.turns > trend.MaxTurns {
420420
trend.MaxTurns = r.turns
421421
}
422422

423-
// Duration stats: only include runs where duration was measured
424-
if i == 0 {
425-
minDuration = r.duration
426-
maxDuration = r.duration
427-
} else {
428-
if r.duration > 0 && (r.duration < minDuration || minDuration == 0) {
429-
minDuration = r.duration
430-
}
431-
if r.duration > maxDuration {
432-
maxDuration = r.duration
433-
}
423+
costStats.Add(r.cost)
424+
tokenStats.Add(float64(r.tokens))
425+
turnStats.Add(float64(r.turns))
426+
// Only include runs where duration was measured to avoid pulling the
427+
// statistics toward zero for runs without timing data.
428+
if r.duration > 0 {
429+
durationStats.Add(float64(r.duration))
434430
}
435-
totalDuration += r.duration
436431
}
437432

438-
n := len(rows)
439-
if n > 0 {
440-
trend.AvgCost = trend.TotalCost / float64(n)
441-
trend.AvgTokens = trend.TotalTokens / n
442-
trend.AvgTurns = float64(trend.TotalTurns) / float64(n)
443-
trend.AvgDurationNs = int64(totalDuration) / int64(n)
444-
trend.MinDurationNs = int64(minDuration)
445-
trend.MaxDurationNs = int64(maxDuration)
433+
if costStats.Count() > 0 {
434+
trend.AvgCost = costStats.Mean()
435+
trend.MedianCost = costStats.Median()
436+
trend.StdDevCost = costStats.SampleStdDev()
437+
trend.MinCost = costStats.Min()
438+
trend.MaxCost = costStats.Max()
439+
}
440+
if tokenStats.Count() > 0 {
441+
trend.AvgTokens = int(tokenStats.Mean())
442+
trend.MedianTokens = tokenStats.Median()
443+
trend.StdDevTokens = tokenStats.SampleStdDev()
444+
trend.MinTokens = int(tokenStats.Min())
445+
trend.MaxTokens = int(tokenStats.Max())
446+
}
447+
if turnStats.Count() > 0 {
448+
trend.AvgTurns = turnStats.Mean()
449+
trend.MedianTurns = turnStats.Median()
450+
trend.StdDevTurns = turnStats.SampleStdDev()
451+
}
452+
if durationStats.Count() > 0 {
453+
trend.AvgDurationNs = int64(durationStats.Mean())
454+
trend.MedianDurationNs = int64(durationStats.Median())
455+
trend.StdDevDurationNs = int64(durationStats.SampleStdDev())
456+
trend.MinDurationNs = int64(durationStats.Min())
457+
trend.MaxDurationNs = int64(durationStats.Max())
446458
}
447459

448460
// Spike detection: > spikeDetectionMultiplier × average

pkg/cli/copilot_events_jsonl.go

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222

2323
"github.com/github/gh-aw/pkg/console"
2424
"github.com/github/gh-aw/pkg/logger"
25+
"github.com/github/gh-aw/pkg/stats"
2526
"github.com/github/gh-aw/pkg/workflow"
2627
)
2728

@@ -282,26 +283,23 @@ func parseEventsJSONLFile(path string, verbose bool) (workflow.LogMetrics, error
282283
// Compute Time Between Turns (TBT) from per-turn timestamps.
283284
// TBT[i] = timestamp[i] - timestamp[i-1] for i > 0. Two or more timestamps
284285
// are required to measure at least one interval. Only positive intervals are
285-
// included so that identical or out-of-order timestamps don't skew the average.
286+
// included so that identical or out-of-order timestamps don't skew the statistics.
286287
if len(turnTimestamps) >= 2 {
287-
var totalTBT time.Duration
288-
var maxTBT time.Duration
289-
validIntervals := 0
288+
var tbtStats stats.StatVar
290289
for i := 1; i < len(turnTimestamps); i++ {
291290
tbt := turnTimestamps[i].Sub(turnTimestamps[i-1])
292291
if tbt > 0 {
293-
totalTBT += tbt
294-
validIntervals++
295-
if tbt > maxTBT {
296-
maxTBT = tbt
297-
}
292+
tbtStats.Add(float64(tbt))
298293
}
299294
}
300-
if validIntervals > 0 {
301-
metrics.AvgTimeBetweenTurns = totalTBT / time.Duration(validIntervals)
302-
metrics.MaxTimeBetweenTurns = maxTBT
303-
copilotEventsJSONLLog.Printf("TBT computed: avg=%s max=%s intervals=%d",
304-
metrics.AvgTimeBetweenTurns, metrics.MaxTimeBetweenTurns, validIntervals)
295+
if tbtStats.Count() > 0 {
296+
metrics.AvgTimeBetweenTurns = time.Duration(tbtStats.Mean())
297+
metrics.MaxTimeBetweenTurns = time.Duration(tbtStats.Max())
298+
metrics.MedianTimeBetweenTurns = time.Duration(tbtStats.Median())
299+
metrics.StdDevTimeBetweenTurns = time.Duration(tbtStats.SampleStdDev())
300+
copilotEventsJSONLLog.Printf("TBT computed: avg=%s max=%s median=%s stddev=%s intervals=%d",
301+
metrics.AvgTimeBetweenTurns, metrics.MaxTimeBetweenTurns,
302+
metrics.MedianTimeBetweenTurns, metrics.StdDevTimeBetweenTurns, tbtStats.Count())
305303
}
306304
}
307305

pkg/cli/health_metrics.go

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"time"
77

88
"github.com/github/gh-aw/pkg/logger"
9+
"github.com/github/gh-aw/pkg/stats"
910
"github.com/github/gh-aw/pkg/timeutil"
1011
)
1112

@@ -78,10 +79,10 @@ func CalculateWorkflowHealth(workflowName string, runs []WorkflowRun, threshold
7879
}
7980
}
8081

81-
// Calculate success and failure counts
82+
// Accumulate success/failure counts and numerical metrics.
8283
successCount := 0
8384
failureCount := 0
84-
var totalDuration time.Duration
85+
var durationStats, tokenStats, costStats stats.StatVar
8586
var totalTokens int
8687
var totalCost float64
8788

@@ -91,9 +92,11 @@ func CalculateWorkflowHealth(workflowName string, runs []WorkflowRun, threshold
9192
} else if isFailureConclusion(run.Conclusion) {
9293
failureCount++
9394
}
94-
totalDuration += run.Duration
9595
totalTokens += run.TokenUsage
9696
totalCost += run.EstimatedCost
97+
durationStats.Add(float64(run.Duration))
98+
tokenStats.Add(float64(run.TokenUsage))
99+
costStats.Add(run.EstimatedCost)
97100
}
98101

99102
totalRuns := len(runs)
@@ -102,19 +105,9 @@ func CalculateWorkflowHealth(workflowName string, runs []WorkflowRun, threshold
102105
successRate = float64(successCount) / float64(totalRuns) * 100
103106
}
104107

105-
// Calculate average duration
106-
avgDuration := time.Duration(0)
107-
if totalRuns > 0 {
108-
avgDuration = totalDuration / time.Duration(totalRuns)
109-
}
110-
111-
// Calculate average tokens and cost
112-
avgTokens := 0
113-
avgCost := 0.0
114-
if totalRuns > 0 {
115-
avgTokens = totalTokens / totalRuns
116-
avgCost = totalCost / float64(totalRuns)
117-
}
108+
avgDuration := time.Duration(durationStats.Mean())
109+
avgTokens := int(tokenStats.Mean())
110+
avgCost := costStats.Mean()
118111

119112
// Calculate trend
120113
trend := calculateTrend(runs)

0 commit comments

Comments (0)