m/pkg/pstore: add package to interface with pstore
This adds a package for interfacing with the Linux kernel's pstore
(persistent storage) system. It currently only handles kmsg/dmesg-type
logs, as mce has an unknown format and I have no examples.
Change-Id: I3089a53cdca224c7e6e04dd51a94035d7b2b880b
Reviewed-on: https://review.monogon.dev/c/monogon/+/769
Reviewed-by: Sergiusz Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/pkg/pstore/BUILD.bazel b/metropolis/pkg/pstore/BUILD.bazel
new file mode 100644
index 0000000..ac84095
--- /dev/null
+++ b/metropolis/pkg/pstore/BUILD.bazel
@@ -0,0 +1,14 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "pstore",
+    srcs = ["pstore.go"],
+    importpath = "source.monogon.dev/metropolis/pkg/pstore",
+    visibility = ["//visibility:public"],
+)
+
+go_test(
+    name = "pstore_test",
+    srcs = ["pstore_test.go"],
+    embed = [":pstore"],  # the test is in package pstore and calls its unexported functions
+)
diff --git a/metropolis/pkg/pstore/pstore.go b/metropolis/pkg/pstore/pstore.go
new file mode 100644
index 0000000..b553c3c
--- /dev/null
+++ b/metropolis/pkg/pstore/pstore.go
@@ -0,0 +1,166 @@
+// Package pstore provides functions for interfacing with the Linux kernel's
+// pstore (persistent storage) system.
+// Documentation for pstore itself can be found at
+// https://docs.kernel.org/admin-guide/abi-testing.html#abi-sys-fs-pstore.
+package pstore
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strconv"
+	"time"
+)
+
+// CanonicalMountPath contains the canonical mount path of the pstore filesystem.
+const CanonicalMountPath = "/sys/fs/pstore"
+
+// pstoreDmesgHeader is the parsed "<Reason>#<Counter> Part<Part>" header line.
+type pstoreDmesgHeader struct {
+	Reason  string // text before the '#', e.g. "Panic" or "Oops"
+	Counter uint64 // decimal number following the '#'
+	Part    uint64 // decimal number following "Part"
+}
+
+var headerRegexp = regexp.MustCompile("^([^#]+)#([0-9]+) Part([0-9]+)$") // e.g. "Panic#2 Part30": reason, counter, part
+
+// parseDmesgHeader parses textual pstore entry headers as assembled by
+// @linux//fs/pstore/platform.c:pstore_dump back into a structured format.
+// The input must be the first line of a file with the terminating \n stripped.
+func parseDmesgHeader(hdr string) (*pstoreDmesgHeader, error) {
+	parts := headerRegexp.FindStringSubmatch(hdr)
+	if parts == nil {
+		return nil, errors.New("unable to parse pstore entry header")
+	}
+	counter, err := strconv.ParseUint(parts[2], 10, 64)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse pstore header count: %w", err)
+	}
+	part, err := strconv.ParseUint(parts[3], 10, 64)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse pstore header part: %w", err)
+	}
+	return &pstoreDmesgHeader{
+		Reason:  parts[1],
+		Counter: counter,
+		Part:    part,
+	}, nil
+}
+
+// KmsgDump is a reassembled kernel message buffer dump from pstore.
+type KmsgDump struct {
+	// The reason why the dump was created. Common values include "Panic" and
+	// "Oops", but depending on the setting `printk.always_kmsg_dump` and
+	// potential future reasons this is likely unbounded.
+	Reason string
+	// The CLOCK_REALTIME value of the first entry in the dump (which is the
+	// closest to the actual time the dump happened). This can be zero or
+	// garbage if the RTC hasn't been initialized or the system has no working
+	// clock source.
+	OccurredAt time.Time
+	// A counter counting up for every dump created. Can be used to order dumps
+	// when the OccurredAt value is not usable due to system issues.
+	Counter uint64
+	// A list of kernel log lines in oldest-to-newest order, i.e. the oldest
+	// message comes first. The actual cause is generally reported last.
+	Lines []string
+}
+
+var dmesgFileRegexp = regexp.MustCompile("^dmesg-.*-([0-9]+)") // prefix match on names like "dmesg-efi-165467917816002"
+
+type pstoreDmesgFile struct { // one parsed dmesg-* file from pstore
+	hdr   pstoreDmesgHeader // parsed from the first line of the file
+	ctime time.Time         // filled from the file's ModTime
+	lines []string          // all lines following the header line
+}
+
+// GetKmsgDumps returns the kmsg (kernel log) buffer dumps found in the pstore
+// mounted at CanonicalMountPath, e.g. those written on a kernel oops or panic.
+func GetKmsgDumps() ([]KmsgDump, error) {
+	return getKmsgDumpsFromFS(os.DirFS(CanonicalMountPath))
+}
+
+// getKmsgDumpsFromFS reassembles the dmesg-* files in the root of f into
+// dumps, sorted newest first. f is injected here for testing.
+func getKmsgDumpsFromFS(f fs.FS) ([]KmsgDump, error) {
+	var events []KmsgDump
+	eventMap := make(map[string][]pstoreDmesgFile)
+	pstoreEntries, err := fs.ReadDir(f, ".")
+	if err != nil {
+		return nil, fmt.Errorf("failed to list files in pstore: %w", err)
+	}
+	for _, entry := range pstoreEntries {
+		if !dmesgFileRegexp.MatchString(entry.Name()) {
+			continue
+		}
+		file, err := f.Open(entry.Name())
+		if err != nil {
+			return nil, fmt.Errorf("failed to open pstore entry file: %w", err)
+		}
+		// Closed only at function return; fine as pstore holds very few files.
+		defer file.Close()
+		finfo, err := file.Stat()
+		if err != nil {
+			return nil, fmt.Errorf("failed to stat pstore entry file: %w", err)
+		}
+		s := bufio.NewScanner(file)
+		if !s.Scan() {
+			if err := s.Err(); err != nil {
+				return nil, fmt.Errorf("cannot read first line header of pstore entry %q: %w", entry.Name(), err)
+			}
+			return nil, fmt.Errorf("cannot read first line header of pstore entry %q: file is empty", entry.Name())
+		}
+		hdr, err := parseDmesgHeader(s.Text())
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse header of file %q: %w", entry.Name(), err)
+		}
+		var lines []string
+		for s.Scan() {
+			lines = append(lines, s.Text())
+		}
+		if err := s.Err(); err != nil {
+			return nil, fmt.Errorf("failed to read pstore entry %q: %w", entry.Name(), err)
+		}
+		// Same textual encoding as the header itself, so as unique as it gets.
+		key := fmt.Sprintf("%v#%d", hdr.Reason, hdr.Counter)
+		eventMap[key] = append(eventMap[key], pstoreDmesgFile{hdr: *hdr, ctime: finfo.ModTime(), lines: lines})
+	}
+	for _, event := range eventMap {
+		sort.Slice(event, func(i, j int) bool {
+			return event[i].hdr.Part > event[j].hdr.Part
+		})
+		// Entries are created in reverse order: the lowest part has the best time.
+		lowest := event[len(event)-1]
+		ev := KmsgDump{Counter: lowest.hdr.Counter, Reason: lowest.hdr.Reason, OccurredAt: lowest.ctime}
+		for _, entry := range event {
+			ev.Lines = append(ev.Lines, entry.lines...)
+		}
+		events = append(events, ev)
+	}
+	sort.Slice(events, func(i, j int) bool {
+		return events[i].OccurredAt.After(events[j].OccurredAt)
+	})
+	return events, nil
+}
+
+// ClearAll removes all existing entries from the pstore. Do this after every
+// start (once the relevant data has been read out) so that there is always
+// space for new entries and badly-programmed firmware is less likely to break.
+// Every entry is attempted even after a failure; the first error is returned.
+func ClearAll() error {
+	pstoreEntries, err := os.ReadDir(CanonicalMountPath)
+	if err != nil {
+		return fmt.Errorf("failed to list files in pstore: %w", err)
+	}
+	for _, entry := range pstoreEntries {
+		if rmErr := os.Remove(filepath.Join(CanonicalMountPath, entry.Name())); rmErr != nil && err == nil {
+			err = fmt.Errorf("failed to clear pstore entry: %w", rmErr)
+		}
+	}
+	return err
+}
diff --git a/metropolis/pkg/pstore/pstore_test.go b/metropolis/pkg/pstore/pstore_test.go
new file mode 100644
index 0000000..0190f04
--- /dev/null
+++ b/metropolis/pkg/pstore/pstore_test.go
@@ -0,0 +1,144 @@
+package pstore
+
+import (
+	"fmt"
+	"testing"
+	"testing/fstest"
+	"time"
+)
+
+func TestParseHeader(t *testing.T) {
+	var cases = []struct {
+		input       string
+		expectedOut *pstoreDmesgHeader
+	}{
+		{"Panic#2 Part30", &pstoreDmesgHeader{"Panic", 2, 30}},
+		{"Oops#1 Part5", &pstoreDmesgHeader{"Oops", 1, 5}},
+		// Random kernel output that is similar, but definitely not a dump header
+		{"<4>[2501503.489317] Oops: 0010 [#1] SMP NOPTI", nil},
+	}
+	for i, c := range cases {
+		t.Run(fmt.Sprintf("Test#%d", i+1), func(t *testing.T) {
+			out, err := parseDmesgHeader(c.input)
+			switch {
+			case err != nil && c.expectedOut != nil:
+				t.Errorf("Failed parsing %q: %v", c.input, err)
+			case err == nil && c.expectedOut == nil:
+				t.Errorf("Successfully parsed %q, expected error", c.input)
+			case err != nil && c.expectedOut == nil:
+			case err == nil && c.expectedOut != nil:
+				if out.Part != c.expectedOut.Part {
+					t.Errorf("Expected part to be %d, got %d", c.expectedOut.Part, out.Part)
+				}
+				if out.Counter != c.expectedOut.Counter {
+					t.Errorf("Expected counter to be %d, got %d", c.expectedOut.Counter, out.Counter)
+				}
+				if out.Reason != c.expectedOut.Reason {
+					t.Errorf("Expected reason to be %q, got %q", c.expectedOut.Reason, out.Reason)
+				}
+			}
+		})
+	}
+}
+
+// TestGetKmsgDumps checks dump reassembly and ordering on in-memory pstores.
+func TestGetKmsgDumps(t *testing.T) {
+	testTime1 := time.Date(2022, time.June, 13, 1, 2, 3, 4, time.UTC)
+	testTime2 := time.Date(2020, time.June, 13, 1, 2, 3, 4, time.UTC)
+	testTime3 := time.Date(2010, time.June, 13, 1, 2, 3, 4, time.UTC)
+	cases := []struct {
+		name          string
+		inputFS       fstest.MapFS
+		expectErr     bool
+		expectedDumps []KmsgDump
+	}{
+		{"EmptyPstore", map[string]*fstest.MapFile{}, false, []KmsgDump{}},
+		{"SingleDumpSingleFile", map[string]*fstest.MapFile{
+			"dmesg-efi-165467917816002": {ModTime: testTime1, Data: []byte("Panic#2 Part1\ntest1\ntest2")},
+			"yolo-efi-165467917816002":  {ModTime: testTime1, Data: []byte("something totally unrelated")},
+		}, false, []KmsgDump{{
+			Reason:     "Panic",
+			OccurredAt: testTime1,
+			Counter:    2,
+			Lines: []string{
+				"test1",
+				"test2",
+			},
+		}}},
+		{"SingleDumpMultipleFiles", map[string]*fstest.MapFile{
+			"dmesg-efi-165467917816002": {ModTime: testTime1, Data: []byte("Panic#2 Part1\ntest2\ntest3")},
+			"dmesg-efi-165467917817002": {ModTime: testTime2, Data: []byte("Panic#2 Part2\ntest1")},
+		}, false, []KmsgDump{{
+			Reason:     "Panic",
+			OccurredAt: testTime1,
+			Counter:    2,
+			Lines: []string{
+				"test1",
+				"test2",
+				"test3",
+			},
+		}}},
+		{"MultipleDumpsMultipleFiles", map[string]*fstest.MapFile{
+			"dmesg-efi-165467917816002": {ModTime: testTime1, Data: []byte("Panic#2 Part1\ntest2\ntest3")},
+			"dmesg-efi-165467917817002": {ModTime: testTime2, Data: []byte("Panic#2 Part2\ntest1")},
+			"dmesg-efi-265467917816002": {ModTime: testTime3, Data: []byte("Oops#1 Part1\noops3")},
+			"dmesg-efi-265467917817002": {ModTime: testTime2, Data: []byte("Oops#1 Part2\noops1\noops2")},
+		}, false, []KmsgDump{{
+			Reason:     "Panic",
+			OccurredAt: testTime1,
+			Counter:    2,
+			Lines: []string{
+				"test1",
+				"test2",
+				"test3",
+			},
+		}, {
+			Reason:     "Oops",
+			OccurredAt: testTime3,
+			Counter:    1,
+			Lines: []string{
+				"oops1",
+				"oops2",
+				"oops3",
+			},
+		}}},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			dumps, err := getKmsgDumpsFromFS(c.inputFS)
+			if c.expectErr {
+				if err == nil {
+					t.Error("Expected error, but got none")
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("Got unexpected error: %v", err)
+			}
+			if len(dumps) != len(c.expectedDumps) {
+				t.Fatalf("Expected %d dumps, got %d", len(c.expectedDumps), len(dumps))
+			}
+			for i, dump := range dumps {
+				want := c.expectedDumps[i]
+				if !dump.OccurredAt.Equal(want.OccurredAt) {
+					t.Errorf("Dump %d expected to have occurred at %v, got %v", i, want.OccurredAt, dump.OccurredAt)
+				}
+				if dump.Reason != want.Reason {
+					t.Errorf("Expected reason in dump %d to be %v, got %v", i, want.Reason, dump.Reason)
+				}
+				if dump.Counter != want.Counter {
+					t.Errorf("Expected counter in dump %d to be %d, got %d", i, want.Counter, dump.Counter)
+				}
+				if len(dump.Lines) != len(want.Lines) {
+					t.Errorf("Expected number of lines in dump %d to be %d, got %d", i, len(want.Lines), len(dump.Lines))
+				}
+				// Bounded by both lengths so a count mismatch cannot panic.
+				for j := 0; j < len(dump.Lines) && j < len(want.Lines); j++ {
+					if dump.Lines[j] != want.Lines[j] {
+						t.Errorf("Expected line %d in dump %d to be %q, got %q", j, i, want.Lines[j], dump.Lines[j])
+					}
+				}
+			}
+		})
+	}
+}