Skip to content

Commit 8a2d459

Browse files
bndctsmholt
andauthored
fix: handshake worker starve when loading external certificate (#369)
Co-authored-by: Matt Holt <mholt@users.noreply.github.com>
1 parent 6902925 commit 8a2d459

File tree

1 file changed

+38
-17
lines changed

1 file changed

+38
-17
lines changed

handshake.go

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -298,9 +298,9 @@ func (cfg *Config) getCertDuringHandshake(ctx context.Context, hello *tls.Client
298298
// domain, avoid pounding manager or storage thousands of times simultaneously. We use a similar sync
299299
// strategy for obtaining certificate during handshake.
300300
certLoadWaitChansMu.Lock()
301-
wait, ok := certLoadWaitChans[name]
301+
waiter, ok := certLoadWaitChans[name]
302302
if ok {
303-
// another goroutine is already loading the cert; just wait and we'll get it from the in-memory cache
303+
// another goroutine is already loading the cert; just wait
304304
certLoadWaitChansMu.Unlock()
305305

306306
timeout := time.NewTimer(2 * time.Minute)
@@ -310,33 +310,44 @@ func (cfg *Config) getCertDuringHandshake(ctx context.Context, hello *tls.Client
310310
case <-ctx.Done():
311311
timeout.Stop()
312312
return Certificate{}, ctx.Err()
313-
case <-wait:
313+
case <-waiter.done:
314314
timeout.Stop()
315315
}
316316

317-
return cfg.getCertDuringHandshake(ctx, hello, false)
318-
} else {
319-
// no other goroutine is currently trying to load this cert
320-
wait = make(chan struct{})
321-
certLoadWaitChans[name] = wait
322-
certLoadWaitChansMu.Unlock()
317+
// If the leader got a result from an external cert manager, use it
318+
// directly — these certs are not added to the cache, so a recursive
319+
// cache lookup would miss. For cached certs (on-demand, managed),
320+
// the waiter result will be empty and we fall through to the
321+
// original recursive lookup.
322+
if !waiter.cert.Empty() || waiter.err != nil {
323+
return waiter.cert, waiter.err
324+
}
323325

324-
// unblock others and clean up when we're done
325-
defer func() {
326-
certLoadWaitChansMu.Lock()
327-
close(wait)
328-
delete(certLoadWaitChans, name)
329-
certLoadWaitChansMu.Unlock()
330-
}()
326+
return cfg.getCertDuringHandshake(ctx, hello, false)
331327
}
332328

329+
// no other goroutine is currently trying to load this cert
330+
waiter = &certLoadWaiter{done: make(chan struct{})}
331+
certLoadWaitChans[name] = waiter
332+
certLoadWaitChansMu.Unlock()
333+
334+
// unblock others and clean up when we're done
335+
defer func() {
336+
certLoadWaitChansMu.Lock()
337+
close(waiter.done)
338+
delete(certLoadWaitChans, name)
339+
certLoadWaitChansMu.Unlock()
340+
}()
341+
333342
// If an external Manager is configured, try to get it from them.
334343
// Only continue to use our own logic if it returns empty+nil.
335344
externalCert, err := cfg.getCertFromAnyCertManager(ctx, hello, logger)
336345
if err != nil {
346+
waiter.err = err
337347
return Certificate{}, err
338348
}
339349
if !externalCert.Empty() {
350+
waiter.cert = externalCert
340351
return externalCert, nil
341352
}
342353

@@ -946,9 +957,19 @@ var (
946957
obtainCertWaitChansMu sync.Mutex
947958
)
948959

960+
// certLoadWaiter coordinates concurrent certificate loading for the same name.
961+
// The leader populates the result and closes the channel; waiters read the result
962+
// after the channel is closed. This allows externally-managed certificates (which
963+
// are not cached) to be shared directly with waiting goroutines.
964+
type certLoadWaiter struct {
965+
done chan struct{}
966+
cert Certificate
967+
err error
968+
}
969+
949970
// TODO: this lockset should probably be per-cache
950971
var (
951-
certLoadWaitChans = make(map[string]chan struct{})
972+
certLoadWaitChans = make(map[string]*certLoadWaiter)
952973
certLoadWaitChansMu sync.Mutex
953974
)
954975

0 commit comments

Comments
 (0)