From f4e37b8afc01253567fddbdd68ec35632df86b62 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek <mknyszek@google.com> Date: Thu, 8 May 2025 19:03:37 +0000 Subject: [PATCH] crypto/tls: use runtime.Gosched instead of time.After in TestCertCache I noticed a failure of this test on a linux/amd64 builder and reproduced it locally. I can only really reproduce it in a stress test when I overload my system (`stress2 ./tls.test -test.run=TestCertCache`) but this points to the root of the problem: it's possible for a timer to get delayed and the timeout fires before we ever get the chance to check. After copious debugging printlns, this is essentially what I'd observed. There would only be one failed check of the reference count from before it was updated. Change the test to be a busy-loop again, but call runtime.Gosched. This is also what we do for the os.Root tests, and in hindsight should've been my go-to. This has a much higher likelihood of executing promptly. We may want to go back and understand why the 1 ms timer would fire so hilariously late the second time. This might be a real bug. For now, this change makes the test more stable. It no longer fails when it's hammered under `stress2`. Fixes #73637. Change-Id: I316bd9e30946f4c055e61d179c4efc5fe029c608 Reviewed-on: https://go-review.googlesource.com/c/go/+/671175 Auto-Submit: Michael Knyszek <mknyszek@google.com> Reviewed-by: Michael Pratt <mpratt@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> --- src/crypto/tls/cache_test.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/crypto/tls/cache_test.go b/src/crypto/tls/cache_test.go index 66854299dfa..ea6b726d5e5 100644 --- a/src/crypto/tls/cache_test.go +++ b/src/crypto/tls/cache_test.go @@ -41,22 +41,12 @@ func TestCertCache(t *testing.T) { timeoutRefCheck := func(t *testing.T, key string, count int64) { t.Helper() - - // Explicitly check every 1 ms up to the timeout instead of busy-looping. - // - // On single-threaded platforms like js/wasm a busy-loop might - // never call into the scheduler for the full timeout, meaning - // that if we arrive here and the cleanup hasn't already run, - // we'll simply loop until the timeout. Busy-loops put us at the - // mercy of the Go scheduler, making this test fragile on some - // platforms. timeout := time.After(4 * time.Second) - check := time.After(1 * time.Millisecond) for { select { case <-timeout: t.Fatal("timed out waiting for expected ref count") - case <-check: + default: e, ok := cc.Load(key) if !ok && count != 0 { t.Fatal("cache does not contain expected key") @@ -68,6 +58,15 @@ func TestCertCache(t *testing.T) { return } } + // Explicitly yield to the scheduler. + // + // On single-threaded platforms like js/wasm a busy-loop might + // never call into the scheduler for the full timeout, meaning + // that if we arrive here and the cleanup hasn't already run, + // we'll simply loop until the timeout. Busy-loops put us at the + // mercy of the Go scheduler, making this test fragile on some + // platforms. + runtime.Gosched() } } -- GitLab