m/n/core: run dedicated PID 1 reaper
This introduces minit, a tiny init implementation, written in C, built
against musl. It does one thing: reap children. No support for TTY, no
configurability, just the bare minimum for a working system.
As a drive-by, we also remove some dead code from main.go.
This solves https://github.com/monogon-dev/monogon/issues/15
Change-Id: I666ff2042f19639465ff918590a39b8e219ee7d6
Reviewed-on: https://review.monogon.dev/c/monogon/+/346
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
diff --git a/metropolis/node/core/minit/BUILD.bazel b/metropolis/node/core/minit/BUILD.bazel
new file mode 100644
index 0000000..573ca92
--- /dev/null
+++ b/metropolis/node/core/minit/BUILD.bazel
@@ -0,0 +1,9 @@
+# minit: the minimal PID 1 binary, built from main.c.
+# Only the //metropolis/node package may depend on this target.
+cc_binary(
+    name = "minit",
+    srcs = ["main.c"],
+    visibility = [
+        "//metropolis/node:__pkg__",
+    ],
+)
diff --git a/metropolis/node/core/minit/main.c b/metropolis/node/core/minit/main.c
new file mode 100644
index 0000000..f2611b4
--- /dev/null
+++ b/metropolis/node/core/minit/main.c
@@ -0,0 +1,132 @@
+// minit is a barebones Linux-compatible init (PID 1) process.
+//
+// Its goal is to run the Metropolis core executable and reap any children that
+// it stumbles upon. It does not support running under a TTY and is not
+// configurable in any way.
+//
+// The only reason this exists is that Go's child process reaping (when
+// using os/exec.Command) races any PID 1 process reaping, thereby preventing
+// running a complex Go binary as PID 1. In the future this might be rewritten
+// in a memory-safe language like Zig or Rust, but this implementation will do
+// for now, as long as it keeps having basically zero attack surface.
+//
+// This code has been vaguely inspired by github.com/Yelp/dumb-init and
+// github.com/krallin/tini, two already existing minimal init implementations.
+// These, however, support being run in a TTY and offer some
+// configurability, as they're meant to be run in containers. We don't need any
+// of that, and we'd rather have as little C as possible.
+
+#include <errno.h>
+#include <linux/reboot.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/reboot.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+void handle_signal(pid_t child_pid, int signum);
+
+int main(void) {
+    // Block all signals in PID 1 itself; they are consumed synchronously via
+    // sigwait() below. The child unblocks them again before exec.
+    sigset_t all_signals;
+    sigfillset(&all_signals);
+    sigprocmask(SIG_BLOCK, &all_signals, NULL);
+
+    // Say hello.
+    fprintf(stderr,
+        "\n"
+        " Metropolis Cluster Operating System\n"
+        " Copyright 2020-2021 The Monogon Project Authors\n"
+        "\n"
+    );
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        fprintf(stderr, "fork(): %s\n", strerror(errno));
+        return 1;
+    }
+
+    if (pid == 0) {
+        // In the child. Unblock all signals and start a new session, which
+        // also makes the child the leader of its own process group.
+        sigprocmask(SIG_UNBLOCK, &all_signals, NULL);
+        if (setsid() == -1) {
+            fprintf(stderr, "setsid: %s\n", strerror(errno));
+            return 1;
+        }
+
+        // Then, start the core executable. execvp() only returns on failure.
+        char *argv[] = { "/core", NULL };
+        execvp(argv[0], argv);
+        fprintf(stderr, "execvp(/core) failed: %s\n", strerror(errno));
+        return 1;
+    }
+
+    // In the parent. Wait for any signal, dispatch it to handle_signal, and
+    // repeat forever.
+    for (;;) {
+        int signum = 0;
+        // sigwait() returns nonzero on failure; only dispatch on success.
+        if (sigwait(&all_signals, &signum) == 0) {
+            handle_signal(pid, signum);
+        }
+    }
+}
+
+// handle_signal is called by the main loop for every signal PID 1 receives.
+// SIGCHLD makes it reap all exited children; any other signal is forwarded to
+// the process group of child_pid. If child_pid itself exited, this reboots.
+void handle_signal(pid_t child_pid, int signum) {
+    // Anything other than SIGCHLD is forwarded to the child's process group.
+    if (signum != SIGCHLD) {
+        kill(-child_pid, signum);
+        return;
+    }
+
+    // A SIGCHLD was received. Go through all children and reap them, checking
+    // if any of them is our direct child.
+
+    // exit_status is set (>= 0) only if the direct child process exited.
+    int exit_status = -1;
+
+    pid_t killed_pid;
+    int status;
+    while ((killed_pid = waitpid(-1, &status, WNOHANG)) > 0) {
+        if (killed_pid != child_pid) {
+            // Something else than our direct child died, just reap it.
+            continue;
+        }
+
+        // Our direct child exited. Translate its status into an exit code.
+        if (WIFEXITED(status)) {
+            // For processes which exited, just use the exit code directly.
+            exit_status = WEXITSTATUS(status);
+        } else if (WIFSIGNALED(status)) {
+            // Emulate what sh/bash do: 128 + the terminating signal number.
+            exit_status = 128 + WTERMSIG(status);
+        } else {
+            // Something unexpected happened. Handle it gracefully but complain.
+            fprintf(stderr, "child status not EXITED nor SIGNALED: %d\n", status);
+            exit_status = 1;
+        }
+    }
+
+    // Direct child exited, so sync disks and reboot the machine.
+    if (exit_status >= 0) {
+        fprintf(stderr, "\n Metropolis core exited with status: %d\n", exit_status);
+        sync();
+        if (exit_status != 0) {
+            fprintf(stderr, " Disks synced, rebooting in 30 seconds...\n");
+            sleep(30);
+            fprintf(stderr, " Rebooting...\n\n");
+        } else {
+            fprintf(stderr, " Disks synced, rebooting...\n\n");
+        }
+        reboot(LINUX_REBOOT_CMD_RESTART);
+        // reboot() only returns on failure; report it instead of staying silent.
+        fprintf(stderr, "reboot(): %s\n", strerror(errno));
+    }
+}