m/node: use pstore for panic storage

The old solution never worked as the ESP was not mounted at that stage.
In general storing crash data there is suboptimal as it makes lots of
assumptions about the system state.

For kernel crashes we already use pstore and there is an interface for
storing userspace messages in pstore as well. Set up the panic handler
to put its logs in there and extend the pstore cleanup runnable to also
dump that part of pstore into the logtree after reboot.

In most cases this also requires a kernel patch as most pstore backends
do not allow userspace messages, probably to preserve limited space.
Since we always clean pstore after reboot, this should be fine.

Change-Id: I011109112e7bfd24d1772d5853a1d491c0cfd026
Reviewed-on: https://review.monogon.dev/c/monogon/+/2753
Reviewed-by: Serge Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/panichandler.go b/metropolis/node/core/panichandler.go
index 3f17ffc..7a6534b 100644
--- a/metropolis/node/core/panichandler.go
+++ b/metropolis/node/core/panichandler.go
@@ -9,7 +9,6 @@
 package main
 
 import (
-	"io"
 	"os"
 	"unsafe"
 
@@ -55,30 +54,13 @@
 	return int32(err)
 }
 
-const runtimeLogPath = "/esp/core_runtime.log"
-
 func initPanicHandler(lt *logtree.LogTree, consoles []console) {
-	rl := lt.MustRawFor("panichandler")
 	l := lt.MustLeveledFor("panichandler")
 
-	runtimeLogFile, err := os.Open(runtimeLogPath)
-	if err != nil && !os.IsNotExist(err) {
-		l.Errorf("Failed to open runtimeLogFile: %v", err)
-	}
-	if err == nil {
-		if _, err := io.Copy(rl, runtimeLogFile); err != nil {
-			l.Errorf("Failed to log old persistent crash: %v", err)
-		}
-		runtimeLogFile.Close()
-		if err := os.Remove(runtimeLogPath); err != nil {
-			l.Errorf("Failed to delete old persistent runtime crash log: %v", err)
-		}
-	}
-
-	// Setup ESP file.
-	fd, err := unix.Open(runtimeLogPath, os.O_CREATE|os.O_WRONLY, 0)
+	// Setup pstore userspace message buffer
+	fd, err := unix.Open("/dev/pmsg0", os.O_WRONLY, 0)
 	if err != nil {
-		l.Errorf("Failed to open core runtime log file: %v", err)
+		l.Errorf("Failed to open pstore userspace device (pstore probably unavailable): %v", err)
 		l.Warningf("Continuing without persistent panic storage.")
 	} else {
 		runtimeFds = append(runtimeFds, fd)
diff --git a/metropolis/node/core/pstore.go b/metropolis/node/core/pstore.go
index 857488a..c8863e5 100644
--- a/metropolis/node/core/pstore.go
+++ b/metropolis/node/core/pstore.go
@@ -23,6 +23,13 @@
 			logger.Warning(line)
 		}
 	}
+	userspaceLines, err := pstore.GetPmsgDump()
+	if err != nil {
+		logger.Errorf("Failed to recover userspace logs from pstore: %v", err)
+	}
+	for _, line := range userspaceLines {
+		logger.Warning(line)
+	}
 	cleanErr := pstore.ClearAll()
 	if cleanErr != nil {
 		logger.Errorf("Failed to clear pstore: %v", err)
diff --git a/metropolis/pkg/pstore/pstore.go b/metropolis/pkg/pstore/pstore.go
index b553c3c..1bfe586 100644
--- a/metropolis/pkg/pstore/pstore.go
+++ b/metropolis/pkg/pstore/pstore.go
@@ -73,6 +73,8 @@
 
 var dmesgFileRegexp = regexp.MustCompile("^dmesg-.*-([0-9]+)")
 
+var pmsgFileRegexp = regexp.MustCompile("^pmsg-.*-([0-9]+)")
+
 type pstoreDmesgFile struct {
 	hdr   pstoreDmesgHeader
 	ctime time.Time
@@ -85,6 +87,32 @@
 	return getKmsgDumpsFromFS(os.DirFS(CanonicalMountPath))
 }
 
+// GetPmsgDump returns lines written into /dev/pmsg0
+func GetPmsgDump() ([]string, error) {
+	var lines []string
+	pstoreEntries, err := os.ReadDir(CanonicalMountPath)
+	if err != nil {
+		return []string{}, fmt.Errorf("failed to list files in pstore: %w", err)
+	}
+	for _, entry := range pstoreEntries {
+		if !pmsgFileRegexp.MatchString(entry.Name()) {
+			continue
+		}
+		f, err := os.Open(filepath.Join(CanonicalMountPath, entry.Name()))
+		if err != nil {
+			return lines, fmt.Errorf("failed to open pstore entry file: %w", err)
+		}
+		// This only closes after all files have been read, but the number of
+		// files is heavily bound by very small amounts of pstore space.
+		defer f.Close()
+		s := bufio.NewScanner(f)
+		for s.Scan() {
+			lines = append(lines, s.Text())
+		}
+	}
+	return lines, nil
+}
+
 // f is injected here for testing
 func getKmsgDumpsFromFS(f fs.FS) ([]KmsgDump, error) {
 	var events []KmsgDump
diff --git a/third_party/linux/external.bzl b/third_party/linux/external.bzl
index e862cb1..84ba704 100644
--- a/third_party/linux/external.bzl
+++ b/third_party/linux/external.bzl
@@ -27,6 +27,7 @@
         patches = [
             "//third_party/linux/external:0001-block-partition-expose-PARTUUID-through-uevent.patch",
             "//third_party/linux/external:disable-static-ifs.patch",
+            "//third_party/linux/external:enable-pmsg.patch",
         ],
         sha256 = sums[version],
         strip_prefix = "linux-" + version,
diff --git a/third_party/linux/external/enable-pmsg.patch b/third_party/linux/external/enable-pmsg.patch
new file mode 100644
index 0000000..8886d94
--- /dev/null
+++ b/third_party/linux/external/enable-pmsg.patch
@@ -0,0 +1,34 @@
+This enables pstore for userspace messages for the ACPI ERST as well as the
+EFI variable pstore backend, both of which are expected to be used frequently
+with Metropolis. We need that to reliably store panics of core processes
+regardless of the system state. This is not enabled upstream, either because
+pmsg was a later addition to pstore or for concerns of accidentally running
+out of pstore space. We work around the limited space issue by always clearing
+pstore after every boot, so this poses little extra risk to us.
+
+diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
+index bf65e3461531..fe2c331b4b99 100644
+--- a/drivers/acpi/apei/erst.c
++++ b/drivers/acpi/apei/erst.c
+@@ -1022,7 +1022,7 @@ static int erst_clearer(struct pstore_record *record);
+ static struct pstore_info erst_info = {
+ 	.owner		= THIS_MODULE,
+ 	.name		= "erst",
+-	.flags		= PSTORE_FLAGS_DMESG,
++	.flags		= PSTORE_FLAGS_DMESG | PSTORE_FLAGS_PMSG,
+ 	.open		= erst_open_pstore,
+ 	.close		= erst_close_pstore,
+ 	.read		= erst_reader,
+diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
+index e7b9ec6f8a86..7570499b46e0 100644
+--- a/drivers/firmware/efi/efi-pstore.c
++++ b/drivers/firmware/efi/efi-pstore.c
+@@ -210,7 +210,7 @@ static int efi_pstore_erase(struct pstore_record *record)
+ static struct pstore_info efi_pstore_info = {
+ 	.owner		= THIS_MODULE,
+ 	.name		= KBUILD_MODNAME,
+-	.flags		= PSTORE_FLAGS_DMESG,
++	.flags		= PSTORE_FLAGS_DMESG | PSTORE_FLAGS_PMSG,
+ 	.open		= efi_pstore_open,
+ 	.close		= efi_pstore_close,
+ 	.read		= efi_pstore_read,
diff --git a/third_party/linux/linux-metropolis.config b/third_party/linux/linux-metropolis.config
index b31a141..830a77b 100644
--- a/third_party/linux/linux-metropolis.config
+++ b/third_party/linux/linux-metropolis.config
@@ -3851,7 +3851,7 @@
 CONFIG_PSTORE_DEFAULT_KMSG_BYTES=10240
 CONFIG_PSTORE_COMPRESS=y
 # CONFIG_PSTORE_CONSOLE is not set
-# CONFIG_PSTORE_PMSG is not set
+CONFIG_PSTORE_PMSG=y
 # CONFIG_PSTORE_FTRACE is not set
 # CONFIG_PSTORE_RAM is not set
 # CONFIG_PSTORE_BLK is not set