m/pkg/logtree/unraw: deflake tests
This test was flaking quite seriously on CI, due to races between the
supervisor's test harness logging a startup message and the test's own
logging exercise logic.
Fixing that, I also spotted a rare flake that occurs when a
NamedPipeReader restart races a write to the named pipe, possible
causing the write to be sent to the old but still running
NamedPipeReader. We fix that in the test, as fixing this in the code
itself is difficult and the resulting problem (a lost log line in this
rare race condition) isn't that bad.
Change-Id: If749798498acb9bf9e6557fd9cbcc441207b9726
Reviewed-on: https://review.monogon.dev/c/monogon/+/345
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/pkg/logtree/unraw/unraw.go b/metropolis/pkg/logtree/unraw/unraw.go
index 807d79b..8e319d2 100644
--- a/metropolis/pkg/logtree/unraw/unraw.go
+++ b/metropolis/pkg/logtree/unraw/unraw.go
@@ -113,6 +113,7 @@
if err != nil {
return fmt.Errorf("when opening named pipe: %w", err)
}
+ defer fifo.Close()
supervisor.Signal(ctx, supervisor.SignalHealthy)
for {
// Quit if requested.
diff --git a/metropolis/pkg/logtree/unraw/unraw_test.go b/metropolis/pkg/logtree/unraw/unraw_test.go
index faf7895..afd1da9 100644
--- a/metropolis/pkg/logtree/unraw/unraw_test.go
+++ b/metropolis/pkg/logtree/unraw/unraw_test.go
@@ -2,9 +2,11 @@
import (
"context"
+ "errors"
"fmt"
"io/ioutil"
"os"
+ "syscall"
"testing"
"source.monogon.dev/metropolis/pkg/logbuffer"
@@ -19,11 +21,11 @@
}
func TestNamedPipeReader(t *testing.T) {
- dir, err := ioutil.TempDir("", "metropolis-test-named-pipe-reader")
+ dir, err := ioutil.TempDir("/tmp", "metropolis-test-named-pipe-reader")
if err != nil {
t.Fatalf("could not create tempdir: %v", err)
}
- //defer os.RemoveAll(dir)
+ defer os.RemoveAll(dir)
fifoPath := dir + "/fifo"
// Start named pipe reader.
@@ -42,9 +44,14 @@
return r(ctx)
})
- // Wait until NamedPipeReader returns to make sure the fifo was created..
<-started
+ // Open FIFO...
+ f, err := os.OpenFile(fifoPath, os.O_WRONLY, 0)
+ if err != nil {
+ t.Fatalf("could not open fifo: %v", err)
+ }
+
// Start reading all logs.
reader, err := lt.Read("root", logtree.WithChildren(), logtree.WithStream())
if err != nil {
@@ -53,10 +60,6 @@
defer reader.Close()
// Write two lines to the fifo.
- f, err := os.OpenFile(fifoPath, os.O_RDWR, 0)
- if err != nil {
- t.Fatalf("could not open fifo: %v", err)
- }
fmt.Fprintf(f, "foo\nbar\n")
f.Close()
@@ -68,10 +71,34 @@
t.Errorf("expected second message to be %q, got %q", want, got)
}
- // Fully restart the entire hypervisor and pipe reader, redo test, things
+ // Fully restart the entire supervisor and pipe reader, redo test, things
// should continue to work.
stop()
+ // Block until FIFO isn't being read anymore. This ensures that the
+ // NamedPipeReader actually stopped running, otherwise the following write to
+ // the fifo can race by writing to the old NamedPipeReader and making the test
+ // time out. This can also happen in production, but that will just cause us to
+ // lose piped data in the very small race window when this can happen
+ // (statistically in this test, <0.1%).
+ //
+ // The check is being done by opening the FIFO in 'non-blocking mode', which
+ // returns ENXIO immediately if the FIFO has no corresponding writer, and
+ // succeeds otherwise.
+ for {
+ ft, err := os.OpenFile(fifoPath, os.O_WRONLY|syscall.O_NONBLOCK, 0)
+ if err == nil {
+ // There's still a writer, keep trying.
+ ft.Close()
+ } else if errors.Is(err, syscall.ENXIO) {
+ // No writer, break.
+ break
+ } else {
+ // Something else?
+ t.Fatalf("OpenFile(%q): %v", fifoPath, err)
+ }
+ }
+
started = make(chan struct{})
stop, lt = supervisor.TestHarness(t, func(ctx context.Context) error {
converter := Converter{
@@ -87,6 +114,8 @@
return r(ctx)
})
+ <-started
+
// Start reading all logs.
reader, err = lt.Read("root", logtree.WithChildren(), logtree.WithStream())
if err != nil {
@@ -94,11 +123,8 @@
}
defer reader.Close()
- <-started
-
// Write line to the fifo.
- // Write two lines to the fifo.
- f, err = os.OpenFile(fifoPath, os.O_RDWR, 0)
+ f, err = os.OpenFile(fifoPath, os.O_WRONLY, 0)
if err != nil {
t.Fatalf("could not open fifo: %v", err)
}
diff --git a/metropolis/pkg/supervisor/supervisor_testhelpers.go b/metropolis/pkg/supervisor/supervisor_testhelpers.go
index b2812c2..85361c2 100644
--- a/metropolis/pkg/supervisor/supervisor_testhelpers.go
+++ b/metropolis/pkg/supervisor/supervisor_testhelpers.go
@@ -52,6 +52,7 @@
Logger(ctx).Infof("Starting test %s...", t.Name())
if err := r(ctx); err != nil && !errors.Is(err, ctx.Err()) {
t.Errorf("Supervised runnable in harness returned error: %v", err)
+ return err
}
return nil
}, WithExistingLogtree(lt))