m/node: refactor panic handling
This change significantly reworks how we handle panics and runtime
errors in our core process. The explicit panic handler is gone; it has
been replaced by a file that stores the panic persistently, and
the informational message has been moved out to minit.
The runtime log file is stored on the ESP to allow for debugging if the
node crashes before unlocking. The file is reset on every boot, and its
previous contents are dumped into the logtree so that administrators can
look into these errors without launching another OS to dump the file.
Change-Id: I3503eeced2da0bbcb6301a6c39e502bbb9afa827
Reviewed-on: https://review.monogon.dev/c/monogon/+/772
Tested-by: Jenkins CI
Reviewed-by: Sergiusz Bazanski <serge@monogon.tech>
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index eb674ae..ca71fa7 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -21,7 +21,6 @@
"fmt"
"io"
"os"
- "runtime/debug"
"golang.org/x/sys/unix"
@@ -38,22 +37,6 @@
)
func main() {
- defer func() {
- if r := recover(); r != nil {
- fmt.Fprintf(os.Stderr, "\n\n")
- fmt.Fprintf(os.Stderr, " Metropolis encountered an uncorrectable error and this node must be restarted.\n")
- fmt.Fprintf(os.Stderr, " Core panicked: %v\n\n", r)
- debug.PrintStack()
- }
- unix.Sync()
- // TODO(lorenz): Switch this to Reboot when init panics are less likely.
- if err := unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF); err != nil {
- // Best effort, nothing we can do if this fails except printing the error to the
- // console.
- panic(fmt.Sprintf("failed to halt node: %v\n", err))
- }
- }()
-
// Set up basic mounts (like /dev, /sys...).
if err := setupMounts(); err != nil {
panic(fmt.Errorf("could not set up basic mounts: %w", err))
@@ -81,6 +64,8 @@
}
}(p, f)
}
+ // Initialize persistent panic handler early
+ initPanicHandler(lt)
// Initial logger. Used until we get to a supervisor.
logger := lt.MustLeveledFor("init")