m/n/core: add pstore handling

Adds a one-shot runnable which writes all kmsg dumps found in the pstore to
the system log and then clears the pstore. This makes sure that there is
always space for new pstore entries and gives administrators the option of
reading crash logs without booting another operating system. It also keeps
some broken EFI firmware from failing to boot.

Change-Id: Icbf30c0a0898e0e660910a80637d544f022a97cd
Reviewed-on: https://review.monogon.dev/c/monogon/+/770
Reviewed-by: Sergiusz Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/node/core/BUILD.bazel b/metropolis/node/core/BUILD.bazel
index 6d2c9c3..f62982f 100644
--- a/metropolis/node/core/BUILD.bazel
+++ b/metropolis/node/core/BUILD.bazel
@@ -6,6 +6,7 @@
     srcs = [
         "main.go",
         "mounts.go",
+        "pstore.go",
     ] + select({
         "//metropolis/node:debug_build": [
             "debug_service_enabled.go",
@@ -28,6 +29,7 @@
         "//metropolis/node/core/roleserve",
         "//metropolis/node/core/time",
         "//metropolis/pkg/logtree",
+        "//metropolis/pkg/pstore",
         "//metropolis/pkg/supervisor",
         "//metropolis/pkg/tpm",
         "//metropolis/proto/api",
diff --git a/metropolis/node/core/main.go b/metropolis/node/core/main.go
index 0af18e9..eb674ae 100644
--- a/metropolis/node/core/main.go
+++ b/metropolis/node/core/main.go
@@ -132,6 +132,9 @@
 		if err := supervisor.Run(ctx, "time", timeSvc.Run); err != nil {
 			return fmt.Errorf("when starting time: %w", err)
 		}
+		if err := supervisor.Run(ctx, "pstore", dumpAndCleanPstore); err != nil {
+			return fmt.Errorf("when starting pstore: %w", err)
+		}
 
 		// Start the role service. The role service connects to the curator and runs
 		// all node-specific role code (eg. Kubernetes services).
diff --git a/metropolis/node/core/mounts.go b/metropolis/node/core/mounts.go
index 797d892..cb8351d 100644
--- a/metropolis/node/core/mounts.go
+++ b/metropolis/node/core/mounts.go
@@ -37,6 +37,7 @@
 		{"/sys", "sysfs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/sys/kernel/tracing", "tracefs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/sys/firmware/efi/efivars", "efivarfs", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
+		{"/sys/fs/pstore", "pstore", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/proc", "proc", unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV},
 		{"/dev", "devtmpfs", unix.MS_NOEXEC | unix.MS_NOSUID},
 		{"/dev/pts", "devpts", unix.MS_NOEXEC | unix.MS_NOSUID},
diff --git a/metropolis/node/core/pstore.go b/metropolis/node/core/pstore.go
new file mode 100644
index 0000000..01f6cfd
--- /dev/null
+++ b/metropolis/node/core/pstore.go
@@ -0,0 +1,35 @@
+package main
+
+import (
+	"context"
+
+	"source.monogon.dev/metropolis/pkg/pstore"
+	"source.monogon.dev/metropolis/pkg/supervisor"
+)
+
+// dumpAndCleanPstore dumps all kmsg dumps accumulated in the pstore into the
+// log and then clears the pstore, keeping its generally limited storage from
+// overflowing. Failures are only logged; the returned error is always nil.
+func dumpAndCleanPstore(ctx context.Context) error {
+	logger := supervisor.Logger(ctx)
+	// Retrying this is extremely unlikely to result in any change and is most
+	// likely just going to generate large amounts of useless logs obscuring
+	// errors.
+	// NOTE(review): confirm SignalDone is valid without a prior SignalHealthy.
+	supervisor.Signal(ctx, supervisor.SignalDone)
+	dumps, err := pstore.GetKmsgDumps()
+	if err != nil {
+		logger.Errorf("Failed to recover logs from pstore: %v", err)
+		return nil
+	}
+	for _, dump := range dumps {
+		logger.Errorf("Recovered log from %v at %v. Reconstructed log follows.", dump.Reason, dump.OccurredAt)
+		for _, line := range dump.Lines {
+			logger.Warning(line)
+		}
+	}
+	if err := pstore.ClearAll(); err != nil {
+		logger.Errorf("Failed to clear pstore: %v", err)
+	}
+	return nil
+}