core/tests/e2e: wait for all subprocesses we created
Test Plan: `bazel test core/tests/e2e/... --runs_per_test=10`
X-Origin-Diff: phab/D548
GitOrigin-RevId: e7ed0d0f782fc38dfa94f83ded890187c6fd9c70
diff --git a/core/internal/launch/launch.go b/core/internal/launch/launch.go
index 9aa277c..a88e46d 100644
--- a/core/internal/launch/launch.go
+++ b/core/internal/launch/launch.go
@@ -21,6 +21,7 @@
"fmt"
"io"
"io/ioutil"
+ "log"
"net"
"os"
"os/exec"
@@ -211,17 +212,32 @@
qemuArgs = append(qemuArgs, "-no-reboot")
}
- tpmCtx, tpmStop := context.WithCancel(
- ctx)
- tpmEmuCmd := exec.CommandContext(tpmCtx, "swtpm", "socket", "--tpm2", "--tpmstate", "dir="+tpmTargetDir, "--ctrl", "type=unixio,path="+tpmSocketPath)
- systemCmd := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
+ // Start TPM emulator as a subprocess
+ tpmCtx, tpmCancel := context.WithCancel(ctx)
+ defer tpmCancel()
+ tpmEmuCmd := exec.CommandContext(tpmCtx, "swtpm", "socket", "--tpm2", "--tpmstate", "dir="+tpmTargetDir, "--ctrl", "type=unixio,path="+tpmSocketPath)
tpmEmuCmd.Stderr = os.Stderr
tpmEmuCmd.Stdout = os.Stdout
+
+ err = tpmEmuCmd.Start()
+ if err != nil {
+ return fmt.Errorf("failed to start TPM emulator: %w", err)
+ }
+
+ // Start the main qemu binary
+ systemCmd := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
systemCmd.Stderr = os.Stderr
systemCmd.Stdout = os.Stdout
- go tpmEmuCmd.Run()
+
err = systemCmd.Run()
- tpmStop()
- return err
+
+ // Stop TPM emulator and wait for it to exit to properly reap the child process
+ tpmCancel()
+ log.Print("Waiting for TPM emulator to exit")
+ // Wait is expected to return an error here, because cancelling the context kills the process with SIGKILL.
+ // We still need to call it to reap the child process and avoid leaving a zombie behind.
+ _ = tpmEmuCmd.Wait()
+
+ return nil
}
diff --git a/core/tests/e2e/main_test.go b/core/tests/e2e/main_test.go
index d400b9b..b59ed7d 100644
--- a/core/tests/e2e/main_test.go
+++ b/core/tests/e2e/main_test.go
@@ -24,7 +24,6 @@
"net/http"
_ "net/http"
_ "net/http/pprof"
- "os"
"testing"
"time"
@@ -46,15 +45,18 @@
}()
// Set a global timeout to make sure this terminates
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
- defer cancel()
portMap, err := launch.ConflictFreePortMap()
if err != nil {
t.Fatalf("Failed to acquire ports for e2e test: %v", err)
}
+
+ procExit := make(chan struct{})
+
go func() {
if err := launch.Launch(ctx, launch.Options{Ports: portMap}); err != nil {
panic(err)
}
+ close(procExit)
}()
grpcClient, err := portMap.DialGRPC(common.DebugServicePort, grpc.WithInsecure())
if err != nil {
@@ -62,11 +64,6 @@
}
debugClient := apipb.NewNodeDebugServiceClient(grpcClient)
- go func() {
- <-ctx.Done()
- fmt.Fprintf(os.Stderr, "Main context canceled\n")
- }()
-
// This exists to keep the parent around while all the children race
// It currently tests both a set of OS-level conditions and Kubernetes Deployments and StatefulSets
t.Run("RunGroup", func(t *testing.T) {
@@ -166,4 +163,9 @@
})
})
})
+
+ // Cancel the main context and wait for our subprocesses to exit
+ // to avoid leaking them and blocking the parent.
+ cancel()
+ <-procExit
}