core/tests/e2e: wait for all subprocesses we created

Test Plan: `bazel test core/tests/e2e/... --runs_per_test=10`

X-Origin-Diff: phab/D548
GitOrigin-RevId: e7ed0d0f782fc38dfa94f83ded890187c6fd9c70
diff --git a/core/internal/launch/launch.go b/core/internal/launch/launch.go
index 9aa277c..a88e46d 100644
--- a/core/internal/launch/launch.go
+++ b/core/internal/launch/launch.go
@@ -21,6 +21,7 @@
 	"fmt"
 	"io"
 	"io/ioutil"
+	"log"
 	"net"
 	"os"
 	"os/exec"
@@ -211,17 +212,32 @@
 		qemuArgs = append(qemuArgs, "-no-reboot")
 	}
 
-	tpmCtx, tpmStop := context.WithCancel(
-		ctx)
-	tpmEmuCmd := exec.CommandContext(tpmCtx, "swtpm", "socket", "--tpm2", "--tpmstate", "dir="+tpmTargetDir, "--ctrl", "type=unixio,path="+tpmSocketPath)
-	systemCmd := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
+	// Start TPM emulator as a subprocess
+	tpmCtx, tpmCancel := context.WithCancel(ctx)
+	defer tpmCancel()
 
+	tpmEmuCmd := exec.CommandContext(tpmCtx, "swtpm", "socket", "--tpm2", "--tpmstate", "dir="+tpmTargetDir, "--ctrl", "type=unixio,path="+tpmSocketPath)
 	tpmEmuCmd.Stderr = os.Stderr
 	tpmEmuCmd.Stdout = os.Stdout
+
+	err = tpmEmuCmd.Start()
+	if err != nil {
+		return fmt.Errorf("failed to start TPM emulator: %w", err)
+	}
+
+	// Start the main qemu binary
+	systemCmd := exec.CommandContext(ctx, "qemu-system-x86_64", qemuArgs...)
 	systemCmd.Stderr = os.Stderr
 	systemCmd.Stdout = os.Stdout
-	go tpmEmuCmd.Run()
+
 	err = systemCmd.Run()
-	tpmStop()
-	return err
+
+	// Stop TPM emulator and wait for it to exit to properly reap the child process
+	tpmCancel()
+	log.Print("Waiting for TPM emulator to exit")
+	// Wait reports a "signal: killed" error because cancelling the context
+	// SIGKILLs swtpm; ignore it, but still call Wait to avoid leaving a zombie.
+	_ = tpmEmuCmd.Wait()
+
+	return nil
 }
diff --git a/core/tests/e2e/main_test.go b/core/tests/e2e/main_test.go
index d400b9b..b59ed7d 100644
--- a/core/tests/e2e/main_test.go
+++ b/core/tests/e2e/main_test.go
@@ -24,7 +24,6 @@
 	"net/http"
 	_ "net/http"
 	_ "net/http/pprof"
-	"os"
 	"testing"
 	"time"
 
@@ -46,15 +45,18 @@
 	}()
 	// Set a global timeout to make sure this terminates
 	ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
-	defer cancel()
 	portMap, err := launch.ConflictFreePortMap()
 	if err != nil {
 		t.Fatalf("Failed to acquire ports for e2e test: %v", err)
 	}
+
+	procExit := make(chan struct{})
+
 	go func() {
 		if err := launch.Launch(ctx, launch.Options{Ports: portMap}); err != nil {
 			panic(err)
 		}
+		close(procExit)
 	}()
 	grpcClient, err := portMap.DialGRPC(common.DebugServicePort, grpc.WithInsecure())
 	if err != nil {
@@ -62,11 +64,6 @@
 	}
 	debugClient := apipb.NewNodeDebugServiceClient(grpcClient)
 
-	go func() {
-		<-ctx.Done()
-		fmt.Fprintf(os.Stderr, "Main context canceled\n")
-	}()
-
 	// This exists to keep the parent around while all the children race
 	// It currently tests both a set of OS-level conditions and Kubernetes Deployments and StatefulSets
 	t.Run("RunGroup", func(t *testing.T) {
@@ -166,4 +163,9 @@
 			})
 		})
 	})
+
+	// Cancel the main context and wait for our subprocesses to exit
+	// to avoid leaking them and blocking the parent.
+	cancel()
+	<-procExit
 }