m/node: refactor panic handling
This significantly changes how we handle panics and runtime errors in
our core process. The explicit panic handler is gone, replaced by a file
that stores the panic persistently, and the informational message has
been moved out to minit.
The runtime log file is stored on the ESP, to allow for debugging if the
node crashes before unlocking, and is reset on every boot. The file's
previous state is also dumped into the logtree so that administrators
can look into these errors without booting another OS to read the file.
Change-Id: I3503eeced2da0bbcb6301a6c39e502bbb9afa827
Reviewed-on: https://review.monogon.dev/c/monogon/+/772
Tested-by: Jenkins CI
Reviewed-by: Sergiusz Bazanski <serge@monogon.tech>
diff --git a/metropolis/node/core/minit/main.c b/metropolis/node/core/minit/main.c
index cee0878..4677c8e 100644
--- a/metropolis/node/core/minit/main.c
+++ b/metropolis/node/core/minit/main.c
@@ -157,19 +157,18 @@
cprintf("child status not EXITED nor SIGNALED: %d\n", status);
exit_status = 1;
}
- }
- // Direct child exited, let's also exit.
- if (exit_status >= 0) {
- cprintf("\n Metropolis core exited with status: %d\n", exit_status);
- sync();
- if (exit_status != 0) {
- cprintf(" Disks synced, rebooting in 30 seconds...\n", exit_status);
- sleep(30);
- cprintf(" Rebooting...\n\n", exit_status);
- } else {
+ // Direct child exited, let's also exit.
+ if (exit_status >= 0) {
+ if (exit_status == 0) {
+ reboot(LINUX_REBOOT_CMD_RESTART);
+ return;
+ }
+ cprintf("\n Metropolis encountered an uncorrectable error and this node must be restarted.\n");
+ cprintf("core exit status: %d\n", exit_status);
+ sync();
cprintf(" Disks synced, rebooting...\n\n");
+ reboot(LINUX_REBOOT_CMD_RESTART);
}
- reboot(LINUX_REBOOT_CMD_RESTART);
}
}