treewide: introduce osbase package and move things around
All packages except localregistry moved from metropolis/pkg to osbase;
localregistry moved to metropolis/test as it's only used there anyway.
Change-Id: If1a4bf377364bef0ac23169e1b90379c71b06d72
Reviewed-on: https://review.monogon.dev/c/monogon/+/3079
Tested-by: Jenkins CI
Reviewed-by: Serge Bazanski <serge@monogon.tech>
diff --git a/osbase/watchdog/BUILD.bazel b/osbase/watchdog/BUILD.bazel
new file mode 100644
index 0000000..66879b8
--- /dev/null
+++ b/osbase/watchdog/BUILD.bazel
@@ -0,0 +1,9 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+ name = "watchdog",
+ srcs = ["watchdog.go"],
+ importpath = "source.monogon.dev/osbase/watchdog",
+ visibility = ["//visibility:public"],
+ deps = ["@org_golang_x_sys//unix"],  # watchdog.go uses its ioctl wrappers
+)
diff --git a/osbase/watchdog/watchdog.go b/osbase/watchdog/watchdog.go
new file mode 100644
index 0000000..d9a14bf
--- /dev/null
+++ b/osbase/watchdog/watchdog.go
@@ -0,0 +1,215 @@
+// Package watchdog provides access to hardware watchdogs. These can be used to
+// automatically reset/reboot a system if they are no longer pinged.
+package watchdog
+
+import (
+ "bytes"
+ "errors"
+ "fmt"
+ "math"
+ "os"
+ "syscall"
+ "time"
+
+ "golang.org/x/sys/unix"
+)
+
+// Device represents a handle to a hardware watchdog. Open fills in the
+// exported fields from the capabilities reported by the driver.
+type Device struct {
+ // Type identifies the type of watchdog device. It corresponds to the Linux
+ // driver's watchdog_info.identity value.
+ Type string
+ // HasConfigurableTimeout indicates if the device supports SetTimeout.
+ HasConfigurableTimeout bool
+ // HasPretimeout indicates if the device supports notifying the system of
+ // an impending reset and the functions to control this
+ // (Get/SetPreTimeout).
+ HasPretimeout bool
+ // ReportsWatchdogReset indicates if the watchdog is capable of reporting
+ // that it is responsible for the last system reset.
+ ReportsWatchdogReset bool
+
+ raw syscall.RawConn // raw access to f's file descriptor, used for ioctls
+ f *os.File // open device file; nil once Close or CloseActive has run
+}
+
+// Open opens the watchdog device identified by the path to its device inode
+// and reads its capabilities. The file is opened read-write because Close has
+// to write to it. If any step fails, the file is closed before returning.
+func Open(name string) (*Device, error) {
+	f, err := os.OpenFile(name, os.O_RDWR, 0)
+	if err != nil {
+		return nil, err // already wrapped in a PathError
+	}
+	raw, err := f.SyscallConn()
+	if err != nil {
+		f.Close()
+		return nil, fmt.Errorf("while obtaining RawConn: %w", err)
+	}
+	var wdInfo *unix.WatchdogInfo
+	ctrlErr := raw.Control(func(fd uintptr) {
+		wdInfo, err = unix.IoctlGetWatchdogInfo(int(fd))
+	})
+	if ctrlErr != nil {
+		f.Close()
+		return nil, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
+	}
+	if err != nil {
+		f.Close() // release the file on every ioctl failure, not only ENOTTY
+		if errors.Is(err, unix.ENOTTY) {
+			return nil, errors.New("device is not a watchdog")
+		}
+		return nil, fmt.Errorf("while getting watchdog metadata: %w", err)
+	}
+	return &Device{
+		Type:                   string(bytes.Trim(wdInfo.Identity[:], "\x00")),
+		f:                      f,
+		raw:                    raw,
+		HasConfigurableTimeout: wdInfo.Options&unix.WDIOF_SETTIMEOUT != 0,
+		HasPretimeout:          wdInfo.Options&unix.WDIOF_PRETIMEOUT != 0,
+		ReportsWatchdogReset:   wdInfo.Options&unix.WDIOF_CARDRESET != 0,
+	}, nil
+}
+
+// SetTimeout sets the duration since the last ping after which the watchdog
+// performs a recovery action (usually a reset or reboot). The duration is
+// rounded up to whole seconds and may be approximated further by the hardware.
+// GetTimeout returns the active timeout. Requires HasConfigurableTimeout.
+func (w *Device) SetTimeout(t time.Duration) error {
+	if !w.HasConfigurableTimeout {
+		return errors.New("watchdog does not have a configurable timeout, check HasConfigurableTimeout")
+	}
+	var ioctlErr error
+	ctrlErr := w.raw.Control(func(fd uintptr) {
+		ioctlErr = unix.IoctlSetInt(int(fd), unix.WDIOC_SETTIMEOUT, int(math.Ceil(t.Seconds())))
+	})
+	if ctrlErr != nil { // failure of Control itself, not of the ioctl
+		return fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
+	}
+	if ioctlErr != nil {
+		return fmt.Errorf("ioctl(WDIOC_SETTIMEOUT): %w", ioctlErr)
+	}
+	return nil
+}
+
+// GetTimeout returns the timeout the watchdog reports, in whole seconds.
+func (w *Device) GetTimeout() (time.Duration, error) {
+	var ioctlErr error
+	var secs int
+	ctrlErr := w.raw.Control(func(fd uintptr) {
+		secs, ioctlErr = unix.IoctlGetInt(int(fd), unix.WDIOC_GETTIMEOUT)
+	})
+	if ctrlErr != nil { // failure of Control itself, not of the ioctl
+		return 0, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
+	}
+	if ioctlErr != nil {
+		return 0, fmt.Errorf("ioctl(WDIOC_GETTIMEOUT): %w", ioctlErr)
+	}
+	return time.Duration(secs) * time.Second, nil
+}
+
+// SetPreTimeout sets how long before the watchdog expires the system gets
+// notified (via some high-priority interrupt, usually an NMI). A system that
+// is still in a semi-working state can use this to recover or dump diagnostic
+// information before the watchdog forcibly resets it. The duration is rounded
+// up to whole seconds; zero disables the pretimeout. Only available if
+// HasPretimeout is true.
+func (w *Device) SetPreTimeout(t time.Duration) error {
+	if !w.HasPretimeout {
+		return errors.New("watchdog does not have a pretimeout, check HasPretimeout")
+	}
+	var ioctlErr error
+	ctrlErr := w.raw.Control(func(fd uintptr) {
+		ioctlErr = unix.IoctlSetInt(int(fd), unix.WDIOC_SETPRETIMEOUT, int(math.Ceil(t.Seconds())))
+	})
+	if ctrlErr != nil { // failure of Control itself, not of the ioctl
+		return fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
+	}
+	if ioctlErr != nil {
+		return fmt.Errorf("ioctl(WDIOC_SETPRETIMEOUT): %w", ioctlErr)
+	}
+	return nil
+}
+
+// GetPreTimeout returns the current pretimeout in whole seconds (see
+// SetPreTimeout for more). Only available if HasPretimeout is true.
+func (w *Device) GetPreTimeout() (time.Duration, error) {
+	if !w.HasPretimeout {
+		return 0, errors.New("watchdog does not have a pretimeout, check HasPretimeout")
+	}
+	var ioctlErr error
+	var secs int
+	ctrlErr := w.raw.Control(func(fd uintptr) {
+		secs, ioctlErr = unix.IoctlGetInt(int(fd), unix.WDIOC_GETPRETIMEOUT)
+	})
+	if ctrlErr != nil { // failure of Control itself, not of the ioctl
+		return 0, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
+	}
+	if ioctlErr != nil {
+		return 0, fmt.Errorf("ioctl(WDIOC_GETPRETIMEOUT): %w", ioctlErr)
+	}
+	return time.Duration(secs) * time.Second, nil
+}
+
+// Ping sends a keepalive to the watchdog. This needs to be called regularly
+// before the watchdog timeout expires, otherwise the system resets.
+func (w *Device) Ping() error {
+	var ioctlErr error
+	ctrlErr := w.raw.Control(func(fd uintptr) {
+		ioctlErr = unix.IoctlWatchdogKeepalive(int(fd))
+	})
+	if ctrlErr != nil { // failure of Control itself, not of the ioctl
+		return fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
+	}
+	if ioctlErr != nil {
+		return fmt.Errorf("ioctl(WDIOC_KEEPALIVE): %w", ioctlErr)
+	}
+	return nil
+}
+
+// LastResetByWatchdog returns true if the last system reset was caused by
+// this watchdog. Not all watchdogs report this accurately.
+func (w *Device) LastResetByWatchdog() (bool, error) {
+	if !w.ReportsWatchdogReset {
+		return false, errors.New("watchdog does not report resets, check ReportsWatchdogReset")
+	}
+	var ioctlErr error
+	var flags int
+	ctrlErr := w.raw.Control(func(fd uintptr) {
+		flags, ioctlErr = unix.IoctlGetInt(int(fd), unix.WDIOC_GETBOOTSTATUS)
+	})
+	if ctrlErr != nil { // failure of Control itself, not of the ioctl
+		return false, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
+	}
+	if ioctlErr != nil {
+		return false, fmt.Errorf("ioctl(WDIOC_GETBOOTSTATUS): %w", ioctlErr)
+	}
+	return flags&unix.WDIOF_CARDRESET != 0, nil
+}
+
+// Close asks the watchdog to stop by writing 'V' to it and closes the device.
+func (w *Device) Close() error {
+	if w.f == nil {
+		return nil // nothing to close
+	}
+	_, writeErr := w.f.Write([]byte{'V'})
+	closeErr := w.f.Close() // always close, even if the write failed
+	w.f = nil
+	if writeErr != nil {
+		return writeErr // the write error takes precedence
+	}
+	return closeErr
+}
+
+// CloseActive releases all resources and file handles without disabling the
+// watchdog. Unless something else reopens the device and pings it before the
+// timeout expires, the system will be reset.
+func (w *Device) CloseActive() error {
+	if w.f == nil {
+		return nil
+	}
+	closeErr := w.f.Close()
+	w.f = nil
+	return closeErr
+}