Skip to content

Commit 6c1f9b2

Browse files
committed
Exclude outliers from the benchmark and add coefficient of variation evaluation
1 parent 433ad7b commit 6c1f9b2

1 file changed

Lines changed: 131 additions & 26 deletions

File tree

test/systemTests/benchmarkRunner.ts

Lines changed: 131 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@ interface BenchmarkResults {
1212
median: number;
1313
min: number;
1414
max: number;
15+
standardDeviation: number;
16+
confidenceInterval95: { lower: number; upper: number };
1517
totalDuration: number;
1618
};
1719
documentFormatting: {
@@ -23,6 +25,8 @@ interface BenchmarkResults {
2325
median: number;
2426
min: number;
2527
max: number;
28+
standardDeviation: number;
29+
confidenceInterval95: { lower: number; upper: number };
2630
totalDuration: number;
2731
};
2832
};
@@ -36,11 +40,11 @@ class PerformanceBenchmark {
3640
private overallStartTime: bigint = BigInt(0);
3741

3842
private readonly baseline = {
39-
factoryCreation: { average: 0.0001675, median: 0.0001522 },
43+
factoryCreation: { average: 0.0001675, median: 0.0001522, standardDeviation: 0.0001 },
4044
documentFormatting: {
41-
small: { average: 0.1429, median: 0.1162 },
42-
medium: { average: 0.2358, median: 0.2186 },
43-
large: { average: 4.2929, median: 2.2453 }
45+
small: { average: 0.1429, median: 0.1162, standardDeviation: 0.05 },
46+
medium: { average: 0.2358, median: 0.2186, standardDeviation: 0.08 },
47+
large: { average: 4.2929, median: 2.2453, standardDeviation: 2.0 }
4448
}
4549
};
4650

@@ -55,6 +59,8 @@ class PerformanceBenchmark {
5559
median: 0,
5660
min: 0,
5761
max: 0,
62+
standardDeviation: 0,
63+
confidenceInterval95: { lower: 0, upper: 0 },
5864
totalDuration: 0
5965
},
6066
documentFormatting: {},
@@ -150,6 +156,13 @@ class PerformanceBenchmark {
150156

151157
async benchmarkFactoryCreation(iterations: number = 100000): Promise<void> {
152158
console.log('📦 Testing factory creation overhead...');
159+
160+
// Warmup phase to mitigate JIT compilation effects
161+
console.log(' 🔥 Warming up JIT compiler...');
162+
for (let i = 0; i < Math.min(10000, iterations / 10); i++) {
163+
getDocumentPrettyfier();
164+
}
165+
153166
const times: number[] = [];
154167
const sectionStart = process.hrtime.bigint();
155168

@@ -163,13 +176,23 @@ class PerformanceBenchmark {
163176
const sectionEnd = process.hrtime.bigint();
164177
const totalDuration = Number(sectionEnd - sectionStart) / 1_000_000; // Convert to milliseconds
165178

179+
// Remove outliers for more stable results
180+
const cleanTimes = this.removeOutliers(times);
181+
console.log(` 📊 Removed ${times.length - cleanTimes.length} outliers (${((times.length - cleanTimes.length) / times.length * 100).toFixed(1)}%)`);
182+
183+
const average = cleanTimes.reduce((a, b) => a + b, 0) / cleanTimes.length;
184+
const standardDeviation = this.calculateStandardDeviation(cleanTimes);
185+
const confidenceInterval95 = this.calculateConfidenceInterval(cleanTimes);
186+
166187
this.results.factoryCreation = {
167188
iterations,
168-
times,
169-
average: times.reduce((a, b) => a + b, 0) / times.length,
170-
median: this.calculateMedian(times),
171-
min: Math.min(...times),
172-
max: Math.max(...times),
189+
times: cleanTimes,
190+
average,
191+
median: this.calculateMedian(cleanTimes),
192+
min: Math.min(...cleanTimes),
193+
max: Math.max(...cleanTimes),
194+
standardDeviation,
195+
confidenceInterval95,
173196
totalDuration
174197
};
175198
}
@@ -184,9 +207,22 @@ class PerformanceBenchmark {
184207
}, {} as Record<string, typeof this.testFiles>);
185208

186209
for (const [size, files] of Object.entries(sizeGroups)) {
210+
console.log(` 📋 Testing ${size} files...`);
187211
const iterations = this.getIterationsForSize(size as any);
188212
const times: number[] = [];
213+
214+
// Create prettyfier once and reuse
189215
const prettyfier = getDocumentPrettyfier();
216+
217+
// Warmup phase - process each file a few times
218+
console.log(` 🔥 Warming up with ${size} files...`);
219+
const warmupIterations = Math.min(50, Math.floor(iterations / 20));
220+
for (let i = 0; i < warmupIterations; i++) {
221+
const file = files[i % files.length];
222+
const document = await this.openDocument(`resources/${file.name}-input.md`);
223+
prettyfier.provideDocumentFormattingEdits(document, {} as any, {} as any);
224+
}
225+
190226
const sectionStart = process.hrtime.bigint();
191227

192228
for (let i = 0; i < iterations; i++) {
@@ -203,14 +239,24 @@ class PerformanceBenchmark {
203239
const sectionEnd = process.hrtime.bigint();
204240
const totalDuration = Number(sectionEnd - sectionStart) / 1_000_000; // Convert to milliseconds
205241

242+
// Remove outliers for more stable results
243+
const cleanTimes = this.removeOutliers(times);
244+
console.log(` 📊 Removed ${times.length - cleanTimes.length} outliers (${((times.length - cleanTimes.length) / times.length * 100).toFixed(1)}%)`);
245+
246+
const average = cleanTimes.reduce((a, b) => a + b, 0) / cleanTimes.length;
247+
const standardDeviation = this.calculateStandardDeviation(cleanTimes);
248+
const confidenceInterval95 = this.calculateConfidenceInterval(cleanTimes);
249+
206250
this.results.documentFormatting[size] = {
207251
files: files.map(f => f.name),
208252
iterations,
209-
times,
210-
average: times.reduce((a, b) => a + b, 0) / times.length,
211-
median: this.calculateMedian(times),
212-
min: Math.min(...times),
213-
max: Math.max(...times),
253+
times: cleanTimes,
254+
average,
255+
median: this.calculateMedian(cleanTimes),
256+
min: Math.min(...cleanTimes),
257+
max: Math.max(...cleanTimes),
258+
standardDeviation,
259+
confidenceInterval95,
214260
totalDuration
215261
};
216262
}
@@ -224,10 +270,10 @@ class PerformanceBenchmark {
224270

225271
private getIterationsForSize(size: 'small' | 'medium' | 'large'): number {
226272
switch (size) {
227-
case 'small': return 15000;
228-
case 'medium': return 10000;
229-
case 'large': return 750;
230-
default: return 10000;
273+
case 'small': return 25000;
274+
case 'medium': return 8000;
275+
case 'large': return 600;
276+
default: return 15000;
231277
}
232278
}
233279

@@ -239,6 +285,57 @@ class PerformanceBenchmark {
239285
: sorted[mid];
240286
}
241287

288+
/**
 * Drops samples that fall outside the inter-quartile fences
 * `[Q1 - k * IQR, Q3 + k * IQR]` (both bounds inclusive).
 *
 * Inputs with fewer than 10 samples are returned as-is (the very same array
 * reference). Larger inputs yield a new array; `times` itself is never
 * mutated and the surviving samples keep their original order.
 *
 * @param times Raw measurements.
 * @param iqrMultiplier Fence width `k`, expressed in IQR units. Defaults to 3,
 *     which is more conservative than the common 1.5.
 * @returns The samples that lie inside the fences.
 */
private removeOutliers(times: number[], iqrMultiplier: number = 3): number[] {
    if (times.length < 10) return times; // Don't remove outliers from small datasets

    // Sort a copy so the caller's array keeps its order.
    const sorted = [...times].sort((a, b) => a - b);

    // Quartiles are picked at index floor(n * p), without interpolation.
    const q1 = sorted[Math.floor(sorted.length * 0.25)];
    const q3 = sorted[Math.floor(sorted.length * 0.75)];
    const iqr = q3 - q1;

    // NOTE: when iqr is 0 both fences collapse onto the quartile value, so
    // only samples equal to that value survive the filter below.
    const lowerBound = q1 - iqrMultiplier * iqr;
    const upperBound = q3 + iqrMultiplier * iqr;

    return times.filter(time => time >= lowerBound && time <= upperBound);
}
304+
305+
/**
 * Computes the population standard deviation of `times`, i.e. the square
 * root of the mean squared deviation from the mean (divides by `n`, not
 * `n - 1`).
 *
 * @param times Samples to summarise.
 * @returns The standard deviation, or 0 for an empty array (rather than the
 *     NaN that `0 / 0` would produce).
 */
private calculateStandardDeviation(times: number[]): number {
    // Guard the empty case explicitly: the divisions below would yield NaN.
    if (times.length === 0) return 0;

    const mean = times.reduce((a, b) => a + b, 0) / times.length;
    const variance = times.reduce((sum, time) => sum + (time - mean) ** 2, 0) / times.length;
    return Math.sqrt(variance);
}
311+
312+
/**
 * Computes a 95% confidence interval for the mean of `times`.
 *
 * The margin of error is `1.96 * stdDev / sqrt(n)`. 1.96 is the critical
 * value of the standard normal distribution; it is a reasonable stand-in for
 * the Student's t value only when the sample is large.
 *
 * @param times Samples to summarise.
 * @returns The lower and upper bound of the interval. An empty array yields
 *     `{ lower: 0, upper: 0 }` instead of NaN bounds.
 */
private calculateConfidenceInterval(times: number[]): { lower: number; upper: number } {
    // No samples: there is no mean to bracket, and 0 / 0 would give NaN.
    if (times.length === 0) return { lower: 0, upper: 0 };

    const mean = times.reduce((a, b) => a + b, 0) / times.length;
    const stdDev = this.calculateStandardDeviation(times);
    const standardError = stdDev / Math.sqrt(times.length);

    // 95% confidence interval using the normal (z) critical value.
    const marginOfError = 1.96 * standardError;

    return {
        lower: mean - marginOfError,
        upper: mean + marginOfError
    };
}
325+
326+
/**
 * Computes the coefficient of variation of `times`: the standard deviation
 * expressed as a percentage of the mean.
 *
 * @param times Samples to summarise.
 * @returns `stdDev / mean * 100`. Returns 0 when `times` is empty or when the
 *     mean is exactly 0, because the ratio is undefined there and would
 *     otherwise come out as NaN or Infinity.
 */
private calculateCoefficientOfVariation(times: number[]): number {
    if (times.length === 0) return 0;

    const mean = times.reduce((a, b) => a + b, 0) / times.length;
    // Avoid dividing by zero; for non-negative samples a zero mean implies
    // every sample is 0, i.e. there is no variation to report.
    if (mean === 0) return 0;

    const stdDev = this.calculateStandardDeviation(times);
    return (stdDev / mean) * 100; // Return as percentage
}
331+
332+
/**
 * Translates a coefficient of variation (a percentage) into a labelled
 * stability grade.
 *
 * @param coefficientOfVariation The CV value to grade.
 * @returns Excellent for values up to 5, Good up to 10, Fair up to 20 and
 *     Poor for everything else.
 */
private getStabilityRating(coefficientOfVariation: number): string {
    // Ordered from strictest to loosest; the first inclusive upper limit
    // that the value fits under decides the grade.
    const grades: Array<[number, string]> = [
        [5, '🟢 Excellent'],
        [10, '🟡 Good'],
        [20, '🟠 Fair'],
    ];

    for (const [upperLimit, label] of grades) {
        if (coefficientOfVariation <= upperLimit) {
            return label;
        }
    }

    // Values above 20 land here, as does NaN, which fails every comparison.
    return '🔴 Poor';
}
338+
242339
async runFullBenchmark(): Promise<void> {
243340
console.log('🚀 Starting Performance Benchmark Suite');
244341
console.log(`\nUsing ${this.testFiles.length} real test files from system tests\n`);
@@ -264,33 +361,41 @@ class PerformanceBenchmark {
264361

265362
// Factory creation results
266363
const factory = this.results.factoryCreation;
364+
const factoryCv = this.calculateCoefficientOfVariation(factory.times);
267365
console.log(`\n🎯 Factory Creation:`);
268366
console.log(` Iterations: ${factory.iterations}`);
269-
console.log(` Average: ${factory.average.toFixed(3)}ms`);
270-
console.log(` Median: ${factory.median.toFixed(3)}ms`);
271-
console.log(` Min: ${factory.min.toFixed(3)}ms`);
272-
console.log(` Max: ${factory.max.toFixed(3)}ms`);
367+
console.log(` Average: ${factory.average.toFixed(6)}ms ± ${factory.standardDeviation.toFixed(6)}ms`);
368+
console.log(` Median: ${factory.median.toFixed(6)}ms`);
369+
console.log(` 95% CI: [${factory.confidenceInterval95.lower.toFixed(6)}, ${factory.confidenceInterval95.upper.toFixed(6)}]ms`);
370+
console.log(` Range: [${factory.min.toFixed(6)}, ${factory.max.toFixed(6)}]ms`);
371+
console.log(` Stability: ${factoryCv.toFixed(1)}% CV ${this.getStabilityRating(factoryCv)}`);
273372
console.log(` Total Duration: ${factory.totalDuration.toFixed(3)}ms`);
274373

275374
// Document formatting results
276375
for (const [size, results] of Object.entries(this.results.documentFormatting)) {
376+
const cv = this.calculateCoefficientOfVariation(results.times);
277377
console.log(`\n🎯 Document Formatting (${size}):`);
278378
const fileList = results.files.length <= 3
279379
? results.files.join(', ')
280380
: `${results.files.slice(0, 3).join(', ')}...`;
281381
console.log(` Test files: ${results.files.length} files (${fileList})`);
282382
console.log(` Iterations: ${results.iterations}`);
283-
console.log(` Average: ${results.average.toFixed(3)}ms`);
383+
console.log(` Average: ${results.average.toFixed(3)}ms ± ${results.standardDeviation.toFixed(3)}ms`);
284384
console.log(` Median: ${results.median.toFixed(3)}ms`);
285-
console.log(` Min: ${results.min.toFixed(3)}ms`);
286-
console.log(` Max: ${results.max.toFixed(3)}ms`);
385+
console.log(` 95% CI: [${results.confidenceInterval95.lower.toFixed(3)}, ${results.confidenceInterval95.upper.toFixed(3)}]ms`);
386+
console.log(` Range: [${results.min.toFixed(3)}, ${results.max.toFixed(3)}]ms`);
387+
console.log(` Stability: ${cv.toFixed(1)}% CV ${this.getStabilityRating(cv)}`);
287388
console.log(` Total Duration: ${results.totalDuration.toFixed(3)}ms`);
288389
}
289390

290391
console.log('\n' + '='.repeat(100));
291392
console.log(`⏱️ OVERALL BENCHMARK DURATION: ${this.results.overallDuration.toFixed(3)}ms`);
292393
console.log('='.repeat(100));
293-
console.log('💡 TIP: Run this benchmark before and after code changes to measure improvements!');
394+
console.log('💡 TIPS FOR CONSISTENT BENCHMARKING:');
395+
console.log(' • Close other applications to reduce system interference');
396+
console.log(' • Run multiple times and compare confidence intervals');
397+
console.log(' • Look for CV (Coefficient of Variation) < 10% for reliable measurements');
398+
console.log(' • Focus on trends across multiple runs rather than absolute values');
294399
console.log('='.repeat(100));
295400
}
296401

0 commit comments

Comments
 (0)