m/pkg/pstore: add package to interface with pstore
This adds a package for interfacing with the Linux kernel's pstore
(persistent storage) system. It currently only handles kmsg/dmesg-type
records, as MCE records have an unknown format and I have no examples
of them.
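
A minimal usage sketch of the new API (illustrative only; the package
name boothooks and the logAndClearPstore helper are made up and not
part of this change):

  package boothooks

  import (
    "log"

    "source.monogon.dev/metropolis/pkg/pstore"
  )

  // logAndClearPstore logs all kmsg dumps left over from previous boots
  // and then clears the pstore to make room for new entries.
  func logAndClearPstore() {
    dumps, err := pstore.GetKmsgDumps()
    if err != nil {
      log.Printf("reading pstore failed: %v", err)
      return
    }
    for _, d := range dumps {
      log.Printf("previous %s (#%d) at %v, %d lines",
        d.Reason, d.Counter, d.OccurredAt, len(d.Lines))
    }
    if err := pstore.ClearAll(); err != nil {
      log.Printf("clearing pstore failed: %v", err)
    }
  }
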
Change-Id: I3089a53cdca224c7e6e04dd51a94035d7b2b880b
Reviewed-on: https://review.monogon.dev/c/monogon/+/769
Reviewed-by: Sergiusz Bazanski <serge@monogon.tech>
Tested-by: Jenkins CI
diff --git a/metropolis/pkg/pstore/BUILD.bazel b/metropolis/pkg/pstore/BUILD.bazel
new file mode 100644
index 0000000..ac84095
--- /dev/null
+++ b/metropolis/pkg/pstore/BUILD.bazel
@@ -0,0 +1,14 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+ name = "pstore",
+ srcs = ["pstore.go"],
+ importpath = "source.monogon.dev/metropolis/pkg/pstore",
+ visibility = ["//visibility:public"],
+)
+
+go_test(
+ name = "pstore_test",
+ srcs = ["pstore_test.go"],
+ embed = [":pstore"],
+)
diff --git a/metropolis/pkg/pstore/pstore.go b/metropolis/pkg/pstore/pstore.go
new file mode 100644
index 0000000..b553c3c
--- /dev/null
+++ b/metropolis/pkg/pstore/pstore.go
@@ -0,0 +1,166 @@
+// Package pstore provides functions for interfacing with the Linux kernel's
+// pstore (persistent storage) system.
+// Documentation for pstore itself can be found at
+// https://docs.kernel.org/admin-guide/abi-testing.html#abi-sys-fs-pstore.
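+// Currently only kmsg/dmesg-type records are handled.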
+package pstore
+
+import (
+ "bufio"
+ "errors"
+ "fmt"
+ "io/fs"
+ "os"
+ "path/filepath"
+ "regexp"
+ "sort"
+ "strconv"
+ "time"
+)
+
+// CanonicalMountPath is the canonical mount path of the pstore filesystem.
+const CanonicalMountPath = "/sys/fs/pstore"
+
+// pstoreDmesgHeader contains the parsed header data of a pstore dmesg record.
+type pstoreDmesgHeader struct {
+ Reason string
+ Counter uint64
+ Part uint64
+}
+
+var headerRegexp = regexp.MustCompile("^([^#]+)#([0-9]+) Part([0-9]+)$")
+
+// parseDmesgHeader parses textual pstore entry headers as assembled by
+// @linux//fs/pstore/platform.c:pstore_dump back into a structured format.
+// The input must be the first line of a file with the terminating \n stripped.
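+// A header looks, for example, like "Panic#3 Part1" (illustrative values;
+// the format is <reason>#<counter> Part<part>).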
+func parseDmesgHeader(hdr string) (*pstoreDmesgHeader, error) {
+ parts := headerRegexp.FindStringSubmatch(hdr)
+ if parts == nil {
+ return nil, errors.New("unable to parse pstore entry header")
+ }
+ counter, err := strconv.ParseUint(parts[2], 10, 64)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse pstore header count: %w", err)
+ }
+ part, err := strconv.ParseUint(parts[3], 10, 64)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse pstore header part: %w", err)
+ }
+ return &pstoreDmesgHeader{
+ Reason: parts[1],
+ Counter: counter,
+ Part: part,
+ }, nil
+}
+
+// KmsgDump is a reassembled kernel message buffer dump from pstore.
+type KmsgDump struct {
+ // The reason why the dump was created. Common values include "Panic" and
+ // "Oops", but with the `printk.always_kmsg_dump` kernel parameter set and
+ // with reasons potentially being added in the future, the set of values is
+ // effectively unbounded.
+ Reason string
+ // The CLOCK_REALTIME timestamp of the first entry written for the dump,
+ // which is the closest to the actual time the dump happened. This can be
+ // zero or garbage if the RTC hasn't been initialized or the system has no
+ // working clock source.
+ OccurredAt time.Time
+ // A counter counting up for every dump created. Can be used to order dumps
+ // when the OccurredAt value is not usable due to system issues.
+ Counter uint64
+ // A list of kernel log lines in oldest-to-newest order, i.e. the oldest
+ // message comes first. The actual cause is generally reported last.
+ Lines []string
+}
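+
+// For illustration, a panic split across two pstore files might be
+// reassembled into a value like the following (all values made up):
+//
+//   KmsgDump{
+//       Reason:  "Panic",
+//       Counter: 3,
+//       Lines:   []string{"<oldest kernel message>", "...", "<panic report>"},
+//   }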
+
+var dmesgFileRegexp = regexp.MustCompile("^dmesg-.*-([0-9]+)")
+
+type pstoreDmesgFile struct {
+ hdr pstoreDmesgHeader
+ ctime time.Time
+ lines []string
+}
+
+// GetKmsgDumps returns a list of events where the kernel has dumped its kmsg
+// (kernel log) buffer into pstore because of a kernel oops or panic.
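+// It reads from CanonicalMountPath and thus expects the pstore filesystem to
+// be mounted there.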
+func GetKmsgDumps() ([]KmsgDump, error) {
+ return getKmsgDumpsFromFS(os.DirFS(CanonicalMountPath))
+}
+
+// getKmsgDumpsFromFS is the implementation of GetKmsgDumps, with the
+// filesystem f injected so that tests can supply a fake filesystem.
+func getKmsgDumpsFromFS(f fs.FS) ([]KmsgDump, error) {
+ var events []KmsgDump
+ eventMap := make(map[string][]pstoreDmesgFile)
+ pstoreEntries, err := fs.ReadDir(f, ".")
+ if err != nil {
+ return events, fmt.Errorf("failed to list files in pstore: %w", err)
+ }
+ for _, entry := range pstoreEntries {
+ if !dmesgFileRegexp.MatchString(entry.Name()) {
+ continue
+ }
+ file, err := f.Open(entry.Name())
+ if err != nil {
+ return events, fmt.Errorf("failed to open pstore entry file: %w", err)
+ }
+ // This only gets closed after all files have been read, but the number
+ // of files is tightly bounded by the very small amount of pstore space.
+ defer file.Close()
+ finfo, err := file.Stat()
+ if err != nil {
+ return events, fmt.Errorf("failed to stat pstore entry file: %w", err)
+ }
+ s := bufio.NewScanner(file)
+ if !s.Scan() {
+ return events, fmt.Errorf("cannot read first line header of pstore entry %q: %w", entry.Name(), s.Err())
+ }
+ hdr, err := parseDmesgHeader(s.Text())
+ if err != nil {
+ return events, fmt.Errorf("failed to parse header of file %q: %w", entry.Name(), err)
+ }
+ var lines []string
+ for s.Scan() {
+ lines = append(lines, s.Text())
+ }
+ // The key uses the same textual encoding as the header itself
+ // (e.g. "Panic#2"), so it is as unique as the header data allows.
+ key := fmt.Sprintf("%v#%d", hdr.Reason, hdr.Counter)
+ eventMap[key] = append(eventMap[key], pstoreDmesgFile{hdr: *hdr, ctime: finfo.ModTime(), lines: lines})
+ }
+
+ for _, event := range eventMap {
+ sort.Slice(event, func(i, j int) bool {
+ return event[i].hdr.Part > event[j].hdr.Part
+ })
+ ev := KmsgDump{
+ Counter: event[len(event)-1].hdr.Counter,
+ Reason: event[len(event)-1].hdr.Reason,
+ // The lowest-numbered part is written first, so after sorting by
+ // descending part number the last entry carries the timestamp
+ // closest to when the dump actually happened.
+ OccurredAt: event[len(event)-1].ctime,
+ }
+ for _, entry := range event {
+ ev.Lines = append(ev.Lines, entry.lines...)
+ }
+ events = append(events, ev)
+ }
+ sort.Slice(events, func(i, j int) bool {
+ return events[i].OccurredAt.After(events[j].OccurredAt)
+ })
+ return events, nil
+}
+
+// ClearAll clears out all existing entries from the pstore. This should be done
+// on every boot (after the relevant data has been read out) to ensure that
+// there is always space to store new pstore entries and to minimize the risk
+// of triggering bugs in badly-programmed firmware.
+func ClearAll() error {
+ pstoreEntries, err := os.ReadDir(CanonicalMountPath)
+ if err != nil {
+ return fmt.Errorf("failed to list files in pstore: %w", err)
+ }
+ for _, entry := range pstoreEntries {
+ if err := os.Remove(filepath.Join(CanonicalMountPath, entry.Name())); err != nil {
+ return fmt.Errorf("failed to clear pstore entry: %w", err)
+ }
+ }
+ return nil
+}
diff --git a/metropolis/pkg/pstore/pstore_test.go b/metropolis/pkg/pstore/pstore_test.go
new file mode 100644
index 0000000..0190f04
--- /dev/null
+++ b/metropolis/pkg/pstore/pstore_test.go
@@ -0,0 +1,144 @@
+package pstore
+
+import (
+ "fmt"
+ "testing"
+ "testing/fstest"
+ "time"
+)
+
+func TestParseHeader(t *testing.T) {
+ var cases = []struct {
+ input string
+ expectedOut *pstoreDmesgHeader
+ }{
+ {"Panic#2 Part30", &pstoreDmesgHeader{"Panic", 2, 30}},
+ {"Oops#1 Part5", &pstoreDmesgHeader{"Oops", 1, 5}},
+ // Random kernel output that is similar, but definitely not a dump header
+ {"<4>[2501503.489317] Oops: 0010 [#1] SMP NOPTI", nil},
+ }
+ for i, c := range cases {
+ t.Run(fmt.Sprintf("Test#%d", i+1), func(t *testing.T) {
+ out, err := parseDmesgHeader(c.input)
+ switch {
+ case err != nil && c.expectedOut != nil:
+ t.Errorf("Failed parsing %q: %v", c.input, err)
+ case err == nil && c.expectedOut == nil:
+ t.Errorf("Successfully parsed %q, expected error", c.input)
+ case err != nil && c.expectedOut == nil:
+ case err == nil && c.expectedOut != nil:
+ if out.Part != c.expectedOut.Part {
+ t.Errorf("Expected part to be %d, got %d", c.expectedOut.Part, out.Part)
+ }
+ if out.Counter != c.expectedOut.Counter {
+ t.Errorf("Expected counter to be %d, got %d", c.expectedOut.Counter, out.Counter)
+ }
+ if out.Reason != c.expectedOut.Reason {
+ t.Errorf("Expected reason to be %q, got %q", c.expectedOut.Reason, out.Reason)
+ }
+ }
+ })
+ }
+}
+
+func TestGetKmsgDumps(t *testing.T) {
+ testTime1 := time.Date(2022, 06, 13, 1, 2, 3, 4, time.UTC)
+ testTime2 := time.Date(2020, 06, 13, 1, 2, 3, 4, time.UTC)
+ testTime3 := time.Date(2010, 06, 13, 1, 2, 3, 4, time.UTC)
+ cases := []struct {
+ name string
+ inputFS fstest.MapFS
+ expectErr bool
+ expectedDumps []KmsgDump
+ }{
+ {"EmptyPstore", map[string]*fstest.MapFile{}, false, []KmsgDump{}},
+ {"SingleDumpSingleFile", map[string]*fstest.MapFile{
+ "dmesg-efi-165467917816002": {ModTime: testTime1, Data: []byte("Panic#2 Part1\ntest1\ntest2")},
+ "yolo-efi-165467917816002": {ModTime: testTime1, Data: []byte("something totally unrelated")},
+ }, false, []KmsgDump{{
+ Reason: "Panic",
+ OccurredAt: testTime1,
+ Counter: 2,
+ Lines: []string{
+ "test1",
+ "test2",
+ },
+ }}},
+ {"SingleDumpMultipleFiles", map[string]*fstest.MapFile{
+ "dmesg-efi-165467917816002": {ModTime: testTime1, Data: []byte("Panic#2 Part1\ntest2\ntest3")},
+ "dmesg-efi-165467917817002": {ModTime: testTime2, Data: []byte("Panic#2 Part2\ntest1")},
+ }, false, []KmsgDump{{
+ Reason: "Panic",
+ OccurredAt: testTime1,
+ Counter: 2,
+ Lines: []string{
+ "test1",
+ "test2",
+ "test3",
+ },
+ }}},
+ {"MultipleDumpsMultipleFiles", map[string]*fstest.MapFile{
+ "dmesg-efi-165467917816002": {ModTime: testTime1, Data: []byte("Panic#2 Part1\ntest2\ntest3")},
+ "dmesg-efi-165467917817002": {ModTime: testTime2, Data: []byte("Panic#2 Part2\ntest1")},
+ "dmesg-efi-265467917816002": {ModTime: testTime3, Data: []byte("Oops#1 Part1\noops3")},
+ "dmesg-efi-265467917817002": {ModTime: testTime2, Data: []byte("Oops#1 Part2\noops1\noops2")},
+ }, false, []KmsgDump{{
+ Reason: "Panic",
+ OccurredAt: testTime1,
+ Counter: 2,
+ Lines: []string{
+ "test1",
+ "test2",
+ "test3",
+ },
+ }, {
+ Reason: "Oops",
+ OccurredAt: testTime3,
+ Counter: 1,
+ Lines: []string{
+ "oops1",
+ "oops2",
+ "oops3",
+ },
+ }}},
+ }
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ dumps, err := getKmsgDumpsFromFS(c.inputFS)
+ switch {
+ case err == nil && c.expectErr:
+ t.Error("Expected error, but got none")
+ return
+ case err != nil && !c.expectErr:
+ t.Errorf("Got unexpected error: %v", err)
+ return
+ case err != nil && c.expectErr:
+ // Got expected error
+ return
+ case err == nil && !c.expectErr:
+ if len(dumps) != len(c.expectedDumps) {
+ t.Fatalf("Expected %d dumps, got %d", len(c.expectedDumps), len(dumps))
+ }
+ for i, dump := range dumps {
+ if dump.OccurredAt != c.expectedDumps[i].OccurredAt {
+ t.Errorf("Dump %d expected to have occurred at %v, got %v", i, c.expectedDumps[i].OccurredAt, dump.OccurredAt)
+ }
+ if dump.Reason != c.expectedDumps[i].Reason {
+ t.Errorf("Expected reason in dump %d to be %v, got %v", i, c.expectedDumps[i].Reason, dump.Reason)
+ }
+ if dump.Counter != c.expectedDumps[i].Counter {
+ t.Errorf("Expected counter in dump %d to be %d, got %d", i, c.expectedDumps[i].Counter, dump.Counter)
+ }
+ if len(dump.Lines) != len(c.expectedDumps[i].Lines) {
+ t.Errorf("Expected number of lines in dump %d to be %d, got %d", i, len(c.expectedDumps[i].Lines), len(dump.Lines))
+ // Avoid indexing out of range below when the line counts differ.
+ continue
+ }
+ for j := range dump.Lines {
+ if dump.Lines[j] != c.expectedDumps[i].Lines[j] {
+ t.Errorf("Expected line %d in dump %d to be %q, got %q", i, j, c.expectedDumps[i].Lines[j], dump.Lines[j])
+ }
+ }
+ }
+ }
+ })
+ }
+}