blob: d9a14bf0a4f5351a6cb1b994d29f96efa1ffb86f [file] [log] [blame]
// Package watchdog provides access to hardware watchdogs. These can be used to
// automatically reset/reboot a system if they are no longer pinged.
3package watchdog
4
5import (
6 "bytes"
7 "errors"
8 "fmt"
9 "math"
10 "os"
11 "syscall"
12 "time"
13
14 "golang.org/x/sys/unix"
15)
16
// Device is a handle to an opened hardware watchdog device.
type Device struct {
	// Type names the kind of watchdog device. It is taken from the Linux
	// driver's watchdog_info.identity value.
	Type string
	// HasConfigurableTimeout reports whether the device supports the
	// SetTimeout call.
	HasConfigurableTimeout bool
	// HasPretimeout reports whether the device can notify the system of an
	// impending reset, and with it whether the functions controlling that
	// notification (Get/SetPreTimeout) are usable.
	HasPretimeout bool
	// ReportsWatchdogReset reports whether the watchdog is capable of
	// telling that it was responsible for the last system reset.
	ReportsWatchdogReset bool

	// raw gives access to the file descriptor of f for issuing ioctls.
	raw syscall.RawConn
	// f is the opened device file; it is nil once the Device is closed.
	f *os.File
}
36
37// Open opens a watchdog device identified by the path to its device inode.
38func Open(name string) (*Device, error) {
39 f, err := os.Open(name)
40 if err != nil {
41 // Already wrapped by PathError
42 return nil, err
43 }
44 raw, err := f.SyscallConn()
45 if err != nil {
46 f.Close()
47 return nil, fmt.Errorf("while obtaining RawConn: %w", err)
48 }
49 var wdInfo *unix.WatchdogInfo
50 ctrlErr := raw.Control(func(fd uintptr) {
51 wdInfo, err = unix.IoctlGetWatchdogInfo(int(fd))
52 })
53 if ctrlErr != nil {
54 f.Close()
55 return nil, fmt.Errorf("when calling RawConn.Control: %w", err)
56 }
57 if errors.Is(err, unix.ENOTTY) {
58 f.Close()
59 return nil, errors.New("device is not a watchdog")
60 }
61 if err != nil {
62 return nil, fmt.Errorf("while getting watchdog metadata: %w", err)
63 }
64 w := &Device{
65 Type: string(bytes.Trim(wdInfo.Identity[:], "\x00")),
66 f: f,
67 raw: raw,
68 HasConfigurableTimeout: wdInfo.Options&unix.WDIOF_SETTIMEOUT != 0,
69 HasPretimeout: wdInfo.Options&unix.WDIOF_PRETIMEOUT != 0,
70 ReportsWatchdogReset: wdInfo.Options&unix.WDIOF_CARDRESET != 0,
71 }
72 return w, nil
73}
74
75// SetTimeout sets the duration since the last ping after which it performs
76// a recovery actions (usually a reset or reboot).
77// Due to hardware limitations this function may approximate the set duration
78// or not be a available at all. GetTimeout returns the active timeout.
79func (w *Device) SetTimeout(t time.Duration) error {
80 if !w.HasConfigurableTimeout {
81 return errors.New("watchdog does not have a configurable timeout, check HasConfigurableTimeout")
82 }
83 var err error
84 ctrlErr := w.raw.Control(func(fd uintptr) {
85 err = unix.IoctlSetInt(int(fd), unix.WDIOC_SETTIMEOUT, int(math.Ceil(t.Seconds())))
86 })
87 if ctrlErr != nil {
88 return fmt.Errorf("when calling RawConn.Control: %w", err)
89 }
90 if err != nil {
91 return fmt.Errorf("ioctl(WDIOC_SETTIMEOUT): %w", err)
92 }
93 return nil
94}
95
96// GetTimeout returns the configured timeout duration.
97func (w *Device) GetTimeout() (time.Duration, error) {
98 var err error
99 var t int
100 ctrlErr := w.raw.Control(func(fd uintptr) {
101 t, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETTIMEOUT)
102 })
103 if ctrlErr != nil {
104 return 0, fmt.Errorf("when calling RawConn.Control: %w", err)
105 }
106 if err != nil {
107 return 0, fmt.Errorf("ioctl(WDIOC_GETTIMEOUT): %w", err)
108 }
109 return time.Duration(t) * time.Second, nil
110}
111
112// SetPreTimeout sets the minimum duration left on the expiry timer where when
113// it drops below that, the system is notified (via some high-priority
114// interrupt, usually an NMI). This is only available if HasPretimeout is true.
115// This can be used by the system (if it's still in a sem-working state) to
116// recover or dump diagnostic information before it gets forcibly reset by the
117// watchdog. To disable this functionality, set the duration to zero.
118func (w *Device) SetPreTimeout(t time.Duration) error {
119 if !w.HasPretimeout {
120 return errors.New("watchdog does not have a pretimeout, check HasPretimeout")
121 }
122 var err error
123 ctrlErr := w.raw.Control(func(fd uintptr) {
124 err = unix.IoctlSetInt(int(fd), unix.WDIOC_SETPRETIMEOUT, int(math.Ceil(t.Seconds())))
125 })
126 if ctrlErr != nil {
127 return fmt.Errorf("when calling RawConn.Control: %w", err)
128 }
129 if err != nil {
130 return fmt.Errorf("ioctl(WDIOC_SETPRETIMEOUT): %w", err)
131 }
132 return nil
133}
134
135// GetPreTimeout gets the current pre-timeout (see SetPreTimeout for more).
136func (w *Device) GetPreTimeout() (time.Duration, error) {
137 if !w.HasPretimeout {
138 return 0, errors.New("watchdog does not have a pretimeout, check HasPretimeout")
139 }
140 var err error
141 var t int
142 ctrlErr := w.raw.Control(func(fd uintptr) {
143 t, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETPRETIMEOUT)
144 })
145 if ctrlErr != nil {
146 return 0, fmt.Errorf("when calling RawConn.Control: %w", err)
147 }
148 if err != nil {
149 return 0, fmt.Errorf("ioctl(WDIOC_GETPRETIMEOUT): %w", err)
150 }
151 return time.Duration(t) * time.Second, nil
152
153}
154
155// Ping the watchdog. This needs to be called regularly before the
156// watchdog timeout expires, otherwise the system resets.
157func (w *Device) Ping() error {
158 var err error
159 ctrlErr := w.raw.Control(func(fd uintptr) {
160 err = unix.IoctlWatchdogKeepalive(int(fd))
161 })
162 if ctrlErr != nil {
163 return fmt.Errorf("when calling RawConn.Control: %w", err)
164 }
165 if err != nil {
166 return fmt.Errorf("ioctl(WDIOC_KEEPALIVE): %w", err)
167 }
168 return nil
169}
170
171// LastResetByWatchdog returns true if the last system reset was caused by
172// this watchdog. Not all watchdogs report this accurately.
173func (w *Device) LastResetByWatchdog() (bool, error) {
174 if !w.ReportsWatchdogReset {
175 return false, errors.New("watchdog does not report resets, check ReportsWatchdogReset")
176 }
177 var err error
178 var flags int
179 ctrlErr := w.raw.Control(func(fd uintptr) {
180 flags, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETBOOTSTATUS)
181 })
182 if ctrlErr != nil {
183 return false, fmt.Errorf("when calling RawConn.Control: %w", err)
184 }
185 if err != nil {
186 return false, fmt.Errorf("ioctl(WDIOC_GETBOOTSTATUS): %w", err)
187 }
188 return flags&unix.WDIOF_CARDRESET != 0, nil
189}
190
191// Close disables the watchdog and releases all associated resources.
192func (w *Device) Close() error {
193 if w.f != nil {
194 _, err := w.f.Write([]byte{'V'})
195 errClose := w.f.Close()
196 w.f = nil
197 if err != nil {
198 return err
199 }
200 return errClose
201 }
202 return nil
203}
204
205// CloseActive releases all resources and file handles, but keeps the
206// watchdog active. Another system must reopen it and ping it before
207// it expires to avoid a reset.
208func (w *Device) CloseActive() error {
209 if w.f != nil {
210 err := w.f.Close()
211 w.f = nil
212 return err
213 }
214 return nil
215}