blob: d9a14bf0a4f5351a6cb1b994d29f96efa1ffb86f [file] [log] [blame] [edit]
// Package watchdog provides access to hardware watchdogs. These can be used to
// automatically reset/reboot a system if they are no longer pinged.
package watchdog
import (
"bytes"
"errors"
"fmt"
"math"
"os"
"syscall"
"time"
"golang.org/x/sys/unix"
)
// Device represents a handle to a hardware watchdog. The exported fields are
// filled in by Open from the identity and option flags reported by the driver.
type Device struct {
// Type identifies the type of watchdog device. It corresponds to the Linux
// driver's watchdog_info.identity value.
Type string
// HasConfigurableTimeout indicates if the device supports the SetTimeout
// call.
HasConfigurableTimeout bool
// HasPretimeout indicates if the device supports notifying the system of
// an impending reset and the functions to control this
// (Get/SetPreTimeout).
HasPretimeout bool
// ReportsWatchdogReset indicates if the watchdog is capable of reporting
// that it is responsible for the last system reset.
ReportsWatchdogReset bool
// raw is the RawConn obtained from f. It is used to issue ioctls on the
// underlying file descriptor.
raw syscall.RawConn
// f is the open device file. Close and CloseActive set it to nil once the
// handle has been released.
f *os.File
}
// Open opens a watchdog device identified by the path to its device inode
// and queries the driver for its identity and supported options.
//
// The device is opened write-only: Close has to write the magic close
// character to it, and a read-only descriptor would reject that write with
// EBADF, leaving the watchdog armed.
//
// An error is returned if the file cannot be opened, if it is not a watchdog
// device (the metadata ioctl fails with ENOTTY), or if the metadata query
// fails for any other reason. The file is closed again on every error path.
func Open(name string) (*Device, error) {
	f, err := os.OpenFile(name, os.O_WRONLY, 0)
	if err != nil {
		// Already wrapped by PathError
		return nil, err
	}
	raw, err := f.SyscallConn()
	if err != nil {
		f.Close()
		return nil, fmt.Errorf("while obtaining RawConn: %w", err)
	}
	var wdInfo *unix.WatchdogInfo
	ctrlErr := raw.Control(func(fd uintptr) {
		wdInfo, err = unix.IoctlGetWatchdogInfo(int(fd))
	})
	switch {
	case ctrlErr != nil:
		// Control itself failed; err was never assigned by the callback, so
		// ctrlErr is the error that has to be reported.
		f.Close()
		return nil, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
	case errors.Is(err, unix.ENOTTY):
		f.Close()
		return nil, errors.New("device is not a watchdog")
	case err != nil:
		// Do not leak the descriptor when the metadata query fails.
		f.Close()
		return nil, fmt.Errorf("while getting watchdog metadata: %w", err)
	}
	w := &Device{
		// Identity is a fixed-size, NUL-padded byte array.
		Type:                   string(bytes.Trim(wdInfo.Identity[:], "\x00")),
		f:                      f,
		raw:                    raw,
		HasConfigurableTimeout: wdInfo.Options&unix.WDIOF_SETTIMEOUT != 0,
		HasPretimeout:          wdInfo.Options&unix.WDIOF_PRETIMEOUT != 0,
		ReportsWatchdogReset:   wdInfo.Options&unix.WDIOF_CARDRESET != 0,
	}
	return w, nil
}
// SetTimeout sets the duration since the last ping after which the watchdog
// performs a recovery action (usually a reset or reboot).
// Due to hardware limitations this function may approximate the set duration
// or not be available at all. GetTimeout returns the active timeout.
//
// The duration is rounded up to whole seconds. An error is returned if the
// device has no configurable timeout, if the rounded value is negative or
// does not fit into a 32-bit integer, or if the ioctl fails.
func (w *Device) SetTimeout(t time.Duration) error {
	if !w.HasConfigurableTimeout {
		return errors.New("watchdog does not have a configurable timeout, check HasConfigurableTimeout")
	}
	secs := math.Ceil(t.Seconds())
	// The value is handed to the kernel as a 32-bit int; reject anything that
	// would be silently truncated.
	if secs < 0 || secs > math.MaxInt32 {
		return fmt.Errorf("timeout %v is out of range", t)
	}
	var err error
	ctrlErr := w.raw.Control(func(fd uintptr) {
		// WDIOC_SETTIMEOUT expects a pointer to an int, so the value must be
		// passed by reference rather than as the raw ioctl argument.
		err = unix.IoctlSetPointerInt(int(fd), unix.WDIOC_SETTIMEOUT, int(secs))
	})
	if ctrlErr != nil {
		return fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
	}
	if err != nil {
		return fmt.Errorf("ioctl(WDIOC_SETTIMEOUT): %w", err)
	}
	return nil
}
// GetTimeout returns the configured timeout duration. The driver reports the
// value in whole seconds via the WDIOC_GETTIMEOUT ioctl.
//
// An error is returned if the file descriptor cannot be accessed or if the
// ioctl fails.
func (w *Device) GetTimeout() (time.Duration, error) {
	var err error
	var t int
	ctrlErr := w.raw.Control(func(fd uintptr) {
		t, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETTIMEOUT)
	})
	if ctrlErr != nil {
		// Report the Control failure itself; err is still nil at this point.
		return 0, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
	}
	if err != nil {
		return 0, fmt.Errorf("ioctl(WDIOC_GETTIMEOUT): %w", err)
	}
	return time.Duration(t) * time.Second, nil
}
// SetPreTimeout sets the minimum duration left on the expiry timer where when
// it drops below that, the system is notified (via some high-priority
// interrupt, usually an NMI). This is only available if HasPretimeout is true.
// This can be used by the system (if it's still in a semi-working state) to
// recover or dump diagnostic information before it gets forcibly reset by the
// watchdog. To disable this functionality, set the duration to zero.
//
// The duration is rounded up to whole seconds. An error is returned if the
// device has no pretimeout support, if the rounded value is negative or does
// not fit into a 32-bit integer, or if the ioctl fails.
func (w *Device) SetPreTimeout(t time.Duration) error {
	if !w.HasPretimeout {
		return errors.New("watchdog does not have a pretimeout, check HasPretimeout")
	}
	secs := math.Ceil(t.Seconds())
	// The value is handed to the kernel as a 32-bit int; reject anything that
	// would be silently truncated.
	if secs < 0 || secs > math.MaxInt32 {
		return fmt.Errorf("pretimeout %v is out of range", t)
	}
	var err error
	ctrlErr := w.raw.Control(func(fd uintptr) {
		// WDIOC_SETPRETIMEOUT expects a pointer to an int, so the value must
		// be passed by reference rather than as the raw ioctl argument.
		err = unix.IoctlSetPointerInt(int(fd), unix.WDIOC_SETPRETIMEOUT, int(secs))
	})
	if ctrlErr != nil {
		return fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
	}
	if err != nil {
		return fmt.Errorf("ioctl(WDIOC_SETPRETIMEOUT): %w", err)
	}
	return nil
}
// GetPreTimeout gets the current pre-timeout (see SetPreTimeout for more).
// The driver reports the value in whole seconds via the WDIOC_GETPRETIMEOUT
// ioctl.
//
// An error is returned if HasPretimeout is false, if the file descriptor
// cannot be accessed, or if the ioctl fails.
func (w *Device) GetPreTimeout() (time.Duration, error) {
	if !w.HasPretimeout {
		return 0, errors.New("watchdog does not have a pretimeout, check HasPretimeout")
	}
	var err error
	var t int
	ctrlErr := w.raw.Control(func(fd uintptr) {
		t, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETPRETIMEOUT)
	})
	if ctrlErr != nil {
		// Report the Control failure itself; err is still nil at this point.
		return 0, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
	}
	if err != nil {
		return 0, fmt.Errorf("ioctl(WDIOC_GETPRETIMEOUT): %w", err)
	}
	return time.Duration(t) * time.Second, nil
}
// Ping pings the watchdog by issuing the keepalive ioctl. This needs to be
// called regularly before the watchdog timeout expires, otherwise the system
// resets.
//
// An error is returned if the file descriptor cannot be accessed or if the
// ioctl fails.
func (w *Device) Ping() error {
	var err error
	ctrlErr := w.raw.Control(func(fd uintptr) {
		err = unix.IoctlWatchdogKeepalive(int(fd))
	})
	if ctrlErr != nil {
		// Report the Control failure itself; err is still nil at this point.
		return fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
	}
	if err != nil {
		return fmt.Errorf("ioctl(WDIOC_KEEPALIVE): %w", err)
	}
	return nil
}
// LastResetByWatchdog returns true if the last system reset was caused by
// this watchdog. Not all watchdogs report this accurately.
//
// The result is derived from the WDIOF_CARDRESET bit of the boot status
// reported by the driver. An error is returned if ReportsWatchdogReset is
// false, if the file descriptor cannot be accessed, or if the ioctl fails.
func (w *Device) LastResetByWatchdog() (bool, error) {
	if !w.ReportsWatchdogReset {
		return false, errors.New("watchdog does not report resets, check ReportsWatchdogReset")
	}
	var err error
	var flags int
	ctrlErr := w.raw.Control(func(fd uintptr) {
		flags, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETBOOTSTATUS)
	})
	if ctrlErr != nil {
		// Report the Control failure itself; err is still nil at this point.
		return false, fmt.Errorf("when calling RawConn.Control: %w", ctrlErr)
	}
	if err != nil {
		return false, fmt.Errorf("ioctl(WDIOC_GETBOOTSTATUS): %w", err)
	}
	return flags&unix.WDIOF_CARDRESET != 0, nil
}
// Close disables the watchdog and releases all associated resources.
//
// It writes the single byte 'V' to the device before closing it (on Linux
// this is the "magic close" character that asks the driver to disarm the
// watchdog). The file is closed even if that write fails. A write error takes
// precedence over a close error in the returned value. Calling Close on an
// already released handle is a no-op.
func (w *Device) Close() error {
	if w.f == nil {
		return nil
	}
	file := w.f
	w.f = nil
	_, writeErr := file.Write([]byte("V"))
	closeErr := file.Close()
	if writeErr != nil {
		return writeErr
	}
	return closeErr
}
// CloseActive releases all resources and file handles, but keeps the
// watchdog active. Another system must reopen it and ping it before
// it expires to avoid a reset.
//
// Unlike Close, nothing is written to the device before the file is closed.
// Calling CloseActive on an already released handle is a no-op.
func (w *Device) CloseActive() error {
	if w.f == nil {
		return nil
	}
	file := w.f
	w.f = nil
	return file.Close()
}