m/n/core: run dedicated PID 1 reaper

This introduces minit, a tiny init implementation, written in C, built
against musl. It does one thing: reap children. No support for TTY, no
configurability, just the bare minimum for a working system.

We also drive-by remove some dead code from main.go.

This solves https://github.com/monogon-dev/monogon/issues/15

Change-Id: I666ff2042f19639465ff918590a39b8e219ee7d6
Reviewed-on: https://review.monogon.dev/c/monogon/+/346
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/BUILD.bazel b/metropolis/node/BUILD.bazel
index db01806..da5fb8a 100644
--- a/metropolis/node/BUILD.bazel
+++ b/metropolis/node/BUILD.bazel
@@ -35,7 +35,7 @@
         "/data",
     ],
     files = {
-        "//metropolis/node/core": "/init",
+        "//metropolis/node/core": "/core",
 
         # CA Certificate bundle & os-release & resolv.conf
         # These should not be explicitly used by Metropolis code and are only here for compatibility with
@@ -79,6 +79,7 @@
         "@com_github_go_delve_delve//cmd/dlv:dlv": "/dlv",
     },
     files_cc = {
+        "//metropolis/node/core/minit": "/init",
         # runc runtime, with cgo
         "@com_github_opencontainers_runc//:runc": "/containerd/bin/runc",
         "@xfsprogs//:mkfs": "/bin/mkfs.xfs",
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index fa768c2..5d495fa 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -18,15 +18,9 @@
 
 import (
 	"context"
-	"crypto/ed25519"
-	"crypto/rand"
-	"crypto/x509"
 	"fmt"
-	"log"
-	"math/big"
 	"net"
 	"os"
-	"os/signal"
 	"runtime/debug"
 	"time"
 
@@ -51,14 +45,16 @@
 func main() {
 	defer func() {
 		if r := recover(); r != nil {
-			fmt.Println("Init panicked:", r)
+			fmt.Fprintf(os.Stderr, "\n\n")
+			fmt.Fprintf(os.Stderr, "  Metropolis encountered an uncorrectable error and this node must be restarted.\n")
+			fmt.Fprintf(os.Stderr, "  Core panicked: %v\n\n", r)
 			debug.PrintStack()
 		}
 		unix.Sync()
-		// TODO(lorenz): Switch this to Reboot when init panics are less likely
-		// Best effort, nothing we can do if this fails except printing the
-		// error to the console.
+		// TODO(lorenz): Switch this to Reboot when init panics are less likely.
 		if err := unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF); err != nil {
+			// Best effort, nothing we can do if this fails except printing the error to the
+			// console.
 			panic(fmt.Sprintf("failed to halt node: %v\n", err))
 		}
 	}()
@@ -93,9 +89,6 @@
 
 	logger.Info("Starting Metropolis node init")
 
-	signalChannel := make(chan os.Signal, 2)
-	signal.Notify(signalChannel)
-
 	if err := tpm.Initialize(logger); err != nil {
 		logger.Fatalf("Failed to initialize TPM 2.0: %v", err)
 	}
@@ -114,8 +107,7 @@
 	}
 
 	// trapdoor is a channel used to signal to the init service that a very
-	// low-level, unrecoverable failure occured. This causes a GURU MEDITATION
-	// ERROR visible to the end user.
+	// low-level, unrecoverable failure occurred.
 	trapdoor := make(chan struct{})
 
 	// Make context for supervisor. We cancel it when we reach the trapdoor.
@@ -242,87 +234,7 @@
 		return nil
 	}, supervisor.WithExistingLogtree(lt))
 
-	// We're PID1, so orphaned processes get reparented to us to clean up
-	for {
-		select {
-		case <-trapdoor:
-			// If the trapdoor got closed, we got stuck early enough in the
-			// boot process that we can't do anything about it. Display a
-			// generic error message until we handle error conditions better.
-			ctxC()
-			log.Printf("                  ########################")
-			log.Printf("                  # GURU MEDIATION ERROR #")
-			log.Printf("                  ########################")
-			log.Printf("")
-			log.Printf("Metropolis encountered an uncorrectable error and this node must be")
-			log.Printf("restarted.")
-			log.Printf("")
-			log.Printf("(Error condition: init trapdoor closed)")
-			log.Printf("")
-			select {}
-
-		case sig := <-signalChannel:
-			switch sig {
-			case unix.SIGCHLD:
-				var status unix.WaitStatus
-				var rusage unix.Rusage
-				for {
-					res, err := unix.Wait4(-1, &status, unix.WNOHANG, &rusage)
-					if err != nil && err != unix.ECHILD {
-						logger.Errorf("Failed to wait on orphaned child: %v", err)
-						break
-					}
-					if res <= 0 {
-						break
-					}
-				}
-			case unix.SIGURG:
-				// Go 1.14 introduced asynchronous preemption, which uses
-				// SIGURG.
-				// In order not to break backwards compatibility in the
-				// unlikely case of an application actually using SIGURG on its
-				// own, they're not filtering them.
-				// (https://github.com/golang/go/issues/37942)
-				logger.V(5).Info("Ignoring SIGURG")
-			// TODO(lorenz): We can probably get more than just SIGCHLD as init, but I can't think
-			// of any others right now, just log them in case we hit any of them.
-			default:
-				logger.Warningf("Got unexpected signal %s", sig.String())
-			}
-		}
-	}
-}
-
-// nodeCertificate creates a node key/certificate for a foreign node. This is
-// duplicated code with localstorage's PKIDirectory EnsureSelfSigned, but is
-// temporary (and specific to 'golden tickets').
-func (s *debugService) nodeCertificate() (cert, key []byte, err error) {
-	pubKey, privKey, err := ed25519.GenerateKey(rand.Reader)
-	if err != nil {
-		err = fmt.Errorf("failed to generate key: %w", err)
-		return
-	}
-
-	key, err = x509.MarshalPKCS8PrivateKey(privKey)
-	if err != nil {
-		err = fmt.Errorf("failed to marshal key: %w", err)
-		return
-	}
-
-	serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 127)
-	serialNumber, err := rand.Int(rand.Reader, serialNumberLimit)
-	if err != nil {
-		err = fmt.Errorf("failed to generate serial number: %w", err)
-		return
-	}
-
-	template := localstorage.CertificateForNode(pubKey)
-	template.SerialNumber = serialNumber
-
-	cert, err = x509.CreateCertificate(rand.Reader, &template, &template, pubKey, privKey)
-	if err != nil {
-		err = fmt.Errorf("could not sign certificate: %w", err)
-		return
-	}
-	return
+	<-trapdoor
+	logger.Infof("Trapdoor closed, exiting core.")
+	ctxC()
 }
diff --git a/metropolis/node/core/minit/BUILD.bazel b/metropolis/node/core/minit/BUILD.bazel
new file mode 100644
index 0000000..573ca92
--- /dev/null
+++ b/metropolis/node/core/minit/BUILD.bazel
@@ -0,0 +1,9 @@
+cc_binary(
+    name = "minit",
+    srcs = [
+        "main.c",
+    ],
+    visibility = [
+        "//metropolis/node:__pkg__",
+    ],
+)
diff --git a/metropolis/node/core/minit/main.c b/metropolis/node/core/minit/main.c
new file mode 100644
index 0000000..f2611b4
--- /dev/null
+++ b/metropolis/node/core/minit/main.c
@@ -0,0 +1,132 @@
+// minit is a barebones Linux-compatible init (PID 1) process.
+//
+// Its goal is to run the Metropolis core executable and reap any children that
+// it stumbles upon. It does not support running under a TTY and is not
+// configurable in any way.
+//
+// The only reason this exists is because Go's child process reaping (when
+// using os/exec.Command) races any PID 1 process reaping, thereby preventing
+// running a complex Go binary as PID 1. In the future this might be rewritten
+// in a memory-safe language like Zig or Rust, but this implementation will do
+// for now, as long as it keeps having basically zero attack surface.
+//
+// This code has been vaguely inspired by github.com/Yelp/dumb-init and
+// github.com/krallin/tini, two already existing minimal init implementations.
+// These, however, handle being run in a TTY and offer some
+// configurability, as they're meant to be run in containers. We don't need any
+// of that, and we'd rather have as little C as possible.
+
+#include <errno.h>
+#include <linux/reboot.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/reboot.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+void handle_signal(pid_t child_pid, int signum);
+
+int main(void) {
+    // Block all signals. We'll unblock them in the child.
+    sigset_t all_signals;
+    sigfillset(&all_signals);
+    sigprocmask(SIG_BLOCK, &all_signals, NULL);
+
+    // Say hello.
+    fprintf(stderr,
+        "\n"
+        "  Metropolis Cluster Operating System\n"
+        "  Copyright 2020-2021 The Monogon Project Authors\n"
+        "\n"
+    );
+
+    // Fork: the child execs /core, the parent stays behind to reap and forward.
+    pid_t pid = fork();
+    if (pid < 0) {
+        fprintf(stderr, "fork(): %s\n", strerror(errno));
+        return 1;
+    }
+
+    if (pid == 0) {
+        // In the child. Unblock all signals.
+        sigprocmask(SIG_UNBLOCK, &all_signals, NULL);
+        if (setsid() == -1) {
+            fprintf(stderr, "setsid: %s\n", strerror(errno));
+            return 1;
+        }
+
+        // Then, start the core executable.
+        char *argv[] = {
+            "/core",
+            NULL,
+        };
+        execvp(argv[0], argv);
+        fprintf(stderr, "execvp(/core) failed: %s\n", strerror(errno));
+        return 1;
+    }
+
+    // In the parent. Wait for any signal, then handle it and any other pending
+    // ones.
+    for (;;) {
+        int signum = 0;  // kill() with signal 0 sends nothing if sigwait() fails.
+        sigwait(&all_signals, &signum);
+        handle_signal(pid, signum);
+    }
+}
+
+// handle_signal is called by the main reap loop for every signal received. It
+// reaps children if SIGCHLD is received, and otherwise forwards the signal to
+// the direct child's process group. If the direct child exited, it reboots.
+void handle_signal(pid_t child_pid, int signum) {
+    // Anything other than SIGCHLD is forwarded (negative PID: process group).
+    if (signum != SIGCHLD) {
+        kill(-child_pid, signum);
+        return;
+    }
+
+    // A SIGCHLD was received. Go through all children and reap them, checking
+    // if any of them is our direct child. exit_status stays negative unless
+    // the direct child process exited.
+    int exit_status = -1;
+    pid_t reaped_pid;
+    int status;
+    while ((reaped_pid = waitpid(-1, &status, WNOHANG)) > 0) {
+        if (reaped_pid != child_pid) {
+            // Something else than our direct child died, just reap it.
+            continue;
+        }
+
+        // Our direct child exited. Translate its status into an exit code.
+        if (WIFEXITED(status)) {
+            // For processes which exited, just use the exit code directly.
+            exit_status = WEXITSTATUS(status);
+        } else if (WIFSIGNALED(status)) {
+            // Otherwise, emulate what sh/bash do and return 128 + the signal
+            // number that the child received.
+            exit_status = 128 + WTERMSIG(status);
+        } else {
+            // Something unexpected happened. Attempt to handle this gracefully,
+            // but complain.
+            fprintf(stderr, "child status not EXITED nor SIGNALED: %d\n", status);
+            exit_status = 1;
+        }
+    }
+
+    // Direct child exited: sync disks and reboot, delaying on failure exits.
+    if (exit_status >= 0) {
+        fprintf(stderr, "\n  Metropolis core exited with status: %d\n", exit_status);
+        sync();
+        if (exit_status != 0) {
+            fprintf(stderr, "  Disks synced, rebooting in 30 seconds...\n");
+            sleep(30);
+            fprintf(stderr, "  Rebooting...\n\n");
+        } else {
+            fprintf(stderr, "  Disks synced, rebooting...\n\n");
+        }
+        if (reboot(LINUX_REBOOT_CMD_RESTART) == -1) {
+            fprintf(stderr, "reboot(): %s\n", strerror(errno));
+        }
+    }
+}