blob: 5321c0daf41828eb1b44a8496e8cec75c9900aa5 [file] [log] [blame]
// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0
3
// Package watchdog provides access to hardware watchdogs. These can be used to
// automatically reset/reboot a system if they are no longer pinged.
6package watchdog
7
8import (
9 "bytes"
10 "errors"
11 "fmt"
12 "math"
13 "os"
14 "syscall"
15 "time"
16
17 "golang.org/x/sys/unix"
18)
19
// Device is a handle to an opened hardware watchdog device. The exported
// fields describe the device as reported by the driver when it was opened.
type Device struct {
	// Type is the driver-reported name of the watchdog. It is taken from
	// the identity field of the Linux watchdog_info structure.
	Type string
	// HasConfigurableTimeout reports whether the device supports the
	// SetTimeout call.
	HasConfigurableTimeout bool
	// HasPretimeout reports whether the device can notify the system of an
	// impending reset, and with it whether GetPreTimeout and SetPreTimeout
	// are usable.
	HasPretimeout bool
	// ReportsWatchdogReset reports whether the device is capable of telling
	// that it was responsible for the last system reset.
	ReportsWatchdogReset bool

	// f is the open device file; it is nil once the device has been closed.
	f *os.File
	// raw gives access to the file descriptor of f for issuing ioctls.
	raw syscall.RawConn
}
39
40// Open opens a watchdog device identified by the path to its device inode.
41func Open(name string) (*Device, error) {
42 f, err := os.Open(name)
43 if err != nil {
44 // Already wrapped by PathError
45 return nil, err
46 }
47 raw, err := f.SyscallConn()
48 if err != nil {
49 f.Close()
50 return nil, fmt.Errorf("while obtaining RawConn: %w", err)
51 }
52 var wdInfo *unix.WatchdogInfo
53 ctrlErr := raw.Control(func(fd uintptr) {
54 wdInfo, err = unix.IoctlGetWatchdogInfo(int(fd))
55 })
56 if ctrlErr != nil {
57 f.Close()
58 return nil, fmt.Errorf("when calling RawConn.Control: %w", err)
59 }
60 if errors.Is(err, unix.ENOTTY) {
61 f.Close()
62 return nil, errors.New("device is not a watchdog")
63 }
64 if err != nil {
65 return nil, fmt.Errorf("while getting watchdog metadata: %w", err)
66 }
67 w := &Device{
68 Type: string(bytes.Trim(wdInfo.Identity[:], "\x00")),
69 f: f,
70 raw: raw,
71 HasConfigurableTimeout: wdInfo.Options&unix.WDIOF_SETTIMEOUT != 0,
72 HasPretimeout: wdInfo.Options&unix.WDIOF_PRETIMEOUT != 0,
73 ReportsWatchdogReset: wdInfo.Options&unix.WDIOF_CARDRESET != 0,
74 }
75 return w, nil
76}
77
78// SetTimeout sets the duration since the last ping after which it performs
79// a recovery actions (usually a reset or reboot).
80// Due to hardware limitations this function may approximate the set duration
81// or not be a available at all. GetTimeout returns the active timeout.
82func (w *Device) SetTimeout(t time.Duration) error {
83 if !w.HasConfigurableTimeout {
84 return errors.New("watchdog does not have a configurable timeout, check HasConfigurableTimeout")
85 }
86 var err error
87 ctrlErr := w.raw.Control(func(fd uintptr) {
88 err = unix.IoctlSetInt(int(fd), unix.WDIOC_SETTIMEOUT, int(math.Ceil(t.Seconds())))
89 })
90 if ctrlErr != nil {
91 return fmt.Errorf("when calling RawConn.Control: %w", err)
92 }
93 if err != nil {
94 return fmt.Errorf("ioctl(WDIOC_SETTIMEOUT): %w", err)
95 }
96 return nil
97}
98
99// GetTimeout returns the configured timeout duration.
100func (w *Device) GetTimeout() (time.Duration, error) {
101 var err error
102 var t int
103 ctrlErr := w.raw.Control(func(fd uintptr) {
104 t, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETTIMEOUT)
105 })
106 if ctrlErr != nil {
107 return 0, fmt.Errorf("when calling RawConn.Control: %w", err)
108 }
109 if err != nil {
110 return 0, fmt.Errorf("ioctl(WDIOC_GETTIMEOUT): %w", err)
111 }
112 return time.Duration(t) * time.Second, nil
113}
114
115// SetPreTimeout sets the minimum duration left on the expiry timer where when
116// it drops below that, the system is notified (via some high-priority
117// interrupt, usually an NMI). This is only available if HasPretimeout is true.
118// This can be used by the system (if it's still in a sem-working state) to
119// recover or dump diagnostic information before it gets forcibly reset by the
120// watchdog. To disable this functionality, set the duration to zero.
121func (w *Device) SetPreTimeout(t time.Duration) error {
122 if !w.HasPretimeout {
123 return errors.New("watchdog does not have a pretimeout, check HasPretimeout")
124 }
125 var err error
126 ctrlErr := w.raw.Control(func(fd uintptr) {
127 err = unix.IoctlSetInt(int(fd), unix.WDIOC_SETPRETIMEOUT, int(math.Ceil(t.Seconds())))
128 })
129 if ctrlErr != nil {
130 return fmt.Errorf("when calling RawConn.Control: %w", err)
131 }
132 if err != nil {
133 return fmt.Errorf("ioctl(WDIOC_SETPRETIMEOUT): %w", err)
134 }
135 return nil
136}
137
138// GetPreTimeout gets the current pre-timeout (see SetPreTimeout for more).
139func (w *Device) GetPreTimeout() (time.Duration, error) {
140 if !w.HasPretimeout {
141 return 0, errors.New("watchdog does not have a pretimeout, check HasPretimeout")
142 }
143 var err error
144 var t int
145 ctrlErr := w.raw.Control(func(fd uintptr) {
146 t, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETPRETIMEOUT)
147 })
148 if ctrlErr != nil {
149 return 0, fmt.Errorf("when calling RawConn.Control: %w", err)
150 }
151 if err != nil {
152 return 0, fmt.Errorf("ioctl(WDIOC_GETPRETIMEOUT): %w", err)
153 }
154 return time.Duration(t) * time.Second, nil
155
156}
157
158// Ping the watchdog. This needs to be called regularly before the
159// watchdog timeout expires, otherwise the system resets.
160func (w *Device) Ping() error {
161 var err error
162 ctrlErr := w.raw.Control(func(fd uintptr) {
163 err = unix.IoctlWatchdogKeepalive(int(fd))
164 })
165 if ctrlErr != nil {
166 return fmt.Errorf("when calling RawConn.Control: %w", err)
167 }
168 if err != nil {
169 return fmt.Errorf("ioctl(WDIOC_KEEPALIVE): %w", err)
170 }
171 return nil
172}
173
174// LastResetByWatchdog returns true if the last system reset was caused by
175// this watchdog. Not all watchdogs report this accurately.
176func (w *Device) LastResetByWatchdog() (bool, error) {
177 if !w.ReportsWatchdogReset {
178 return false, errors.New("watchdog does not report resets, check ReportsWatchdogReset")
179 }
180 var err error
181 var flags int
182 ctrlErr := w.raw.Control(func(fd uintptr) {
183 flags, err = unix.IoctlGetInt(int(fd), unix.WDIOC_GETBOOTSTATUS)
184 })
185 if ctrlErr != nil {
186 return false, fmt.Errorf("when calling RawConn.Control: %w", err)
187 }
188 if err != nil {
189 return false, fmt.Errorf("ioctl(WDIOC_GETBOOTSTATUS): %w", err)
190 }
191 return flags&unix.WDIOF_CARDRESET != 0, nil
192}
193
194// Close disables the watchdog and releases all associated resources.
195func (w *Device) Close() error {
196 if w.f != nil {
197 _, err := w.f.Write([]byte{'V'})
198 errClose := w.f.Close()
199 w.f = nil
200 if err != nil {
201 return err
202 }
203 return errClose
204 }
205 return nil
206}
207
208// CloseActive releases all resources and file handles, but keeps the
209// watchdog active. Another system must reopen it and ping it before
210// it expires to avoid a reset.
211func (w *Device) CloseActive() error {
212 if w.f != nil {
213 err := w.f.Close()
214 w.f = nil
215 return err
216 }
217 return nil
218}