// Copyright The Monogon Project Authors.
// SPDX-License-Identifier: Apache-2.0

package nvme

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math/big"
	"time"
)

14// healthPage represents the raw data from a NVMe Health/SMART page.
15// See Figure 93 in the spec.
16type healthPage struct {
17 CriticalWarning uint8
18 CompositeTemperature uint16
19 AvailableSpare uint8
20 AvailableSpareThreshold uint8
21 PercentageUsed uint8
22
23 _ [26]byte
24
25 DataUnitsRead uint128le
26 DataUnitsWritten uint128le
27 HostReadCommands uint128le
28 HostWriteCommands uint128le
29 ControllerBusyTime uint128le
30 PowerCycles uint128le
31 PowerOnHours uint128le
32 UnsafeSHutdowns uint128le
33 MediaAndDataIntegrityErrors uint128le
34 ErrorInformationLogEntries uint128le
35
36 WarningCompositeTemperatureTime uint32
37 CriticalCompositeTemperatureTime uint32
38
39 TemperatureSensors [8]uint16
40
41 ThermalMgmtTemperature1TransitionCount uint32
42 ThermalMgmtTemperature2TransitionCount uint32
43
44 _ [8]byte
45
46 TotalTimeForThermalMgmtTemperature1 uint32
47 TotalTimeForThermalMgmtTemperature2 uint32
48}
49
// HealthInfo contains information related to the health of the NVMe device.
//
// Note that some values might be clamped under highly abnormal circumstances
// as they are reported as 128-bit integers which Go doesn't support.
// For easier handling, values which are very unlikely to exceed 64 bits are
// exposed as 64-bit integers.
type HealthInfo struct {
	// AvailableSpareSpaceCritical is set if the available spare capacity has
	// fallen below its threshold.
	AvailableSpareSpaceCritical bool
	// TemperatureCritical is set if a temperature is outside the acceptable
	// operating thresholds.
	TemperatureCritical bool
	// MediaCritical is set if significant media or internal issues affect the
	// operation of the device.
	MediaCritical bool
	// ForcedReadOnly is set if the device is forced into read-only mode due
	// to an error.
	ForcedReadOnly bool
	// VolatileMemoryBackupFailed is set if the volatile memory backup device
	// has failed.
	VolatileMemoryBackupFailed bool
	// CompositeTemperatureKelvin contains a derived value representing the
	// composite state of controller and namespace/flash temperature.
	// The exact mechanism used to derive it is vendor-specific.
	CompositeTemperatureKelvin uint16
	// AvailableSpare represents the relative amount (0-1) of spare capacity
	// still unused.
	AvailableSpare float32
	// AvailableSpareThreshold represents the vendor-defined threshold which
	// AvailableSpare should not fall under.
	AvailableSpareThreshold float32
	// LifeUsed represents a vendor-defined relative estimate of the life of
	// the device which has been used up. It is allowed to exceed 1 and will
	// be clamped by the device somewhere between 1.0 and 2.55.
	LifeUsed float32
	// BytesRead contains the number of bytes read from the device.
	// This value is only updated in increments of 512,000 bytes (1000 data
	// units of 512 bytes each).
	BytesRead *big.Int
	// BytesWritten contains the number of bytes written to the device.
	// This value is only updated in increments of 512,000 bytes (1000 data
	// units of 512 bytes each).
	BytesWritten *big.Int
	// HostReadCommands contains the number of read commands completed by the
	// controller.
	HostReadCommands *big.Int
	// HostWriteCommands contains the number of write commands completed by the
	// controller.
	HostWriteCommands *big.Int
	// ControllerBusyTime contains the cumulative amount of time the controller
	// has spent being busy (i.e. having at least one command outstanding on an
	// I/O queue). This value is only updated in 1m increments.
	ControllerBusyTime time.Duration
	// PowerCycles contains the number of power cycles.
	PowerCycles uint64
	// PowerOnHours contains the number of hours the controller has been
	// powered on. Depending on the vendor implementation it may or may
	// not contain time spent in a non-operational power state.
	PowerOnHours uint64
	// UnsafeShutdowns contains the number of power loss events without
	// a prior shutdown notification from the host.
	UnsafeShutdowns uint64
	// MediaAndDataIntegrityErrors contains the number of occurrences where the
	// controller detected an unrecovered data integrity error.
	MediaAndDataIntegrityErrors uint64
	// ErrorInformationLogEntriesCount contains the number of Error
	// Information log entries over the life of the controller.
	ErrorInformationLogEntriesCount uint64
	// WarningCompositeTemperatureTime contains the amount of time the
	// controller is operational while the composite temperature is greater
	// than the warning composite threshold.
	WarningCompositeTemperatureTime time.Duration
	// CriticalCompositeTemperatureTime contains the amount of time the
	// controller is operational while the composite temperature is greater
	// than the critical composite threshold.
	CriticalCompositeTemperatureTime time.Duration
	// TemperatureSensorValues contains the current temperature in Kelvin as
	// reported by up to 8 sensors on the device. A value of zero means that
	// the given sensor is not available.
	TemperatureSensorValues [8]uint16
	// ThermalMgmtTemperature1TransitionCount contains the number of times the
	// controller transitioned to lower power active power states or performed
	// vendor-specific thermal management actions to reduce temperature.
	ThermalMgmtTemperature1TransitionCount uint32
	// ThermalMgmtTemperature2TransitionCount is the same as
	// ThermalMgmtTemperature1TransitionCount, but for "heavier" thermal
	// management actions including heavy throttling.
	// The actual difference is vendor-specific.
	ThermalMgmtTemperature2TransitionCount uint32
	// TotalTimeForThermalMgmtTemperature1 contains the total time the
	// controller spent under "light" thermal management.
	TotalTimeForThermalMgmtTemperature1 time.Duration
	// TotalTimeForThermalMgmtTemperature2 contains the total time the
	// controller spent under "heavy" thermal management.
	TotalTimeForThermalMgmtTemperature2 time.Duration
}

145// HasCriticalWarning returns true if any of the critical warnings
146// (AvailableSpareSpaceCritical, TemperatureCritical, MediaCritical,
147// ForcedReadOnly, VolatileMemoryBackupFailed) are active.
148// If this returns true the NVMe medium has reason to believe that
149// data availability or integrity is endangered.
150func (h *HealthInfo) HasCriticalWarning() bool {
151 return h.AvailableSpareSpaceCritical || h.TemperatureCritical || h.MediaCritical || h.ForcedReadOnly || h.VolatileMemoryBackupFailed
152}
153
// dataUnit is the number of bytes represented by a single increment of the
// Data Units Read/Written counters: 1000 units of 512 bytes each.
// See Figure 93 Data Units Read.
var dataUnit = big.NewInt(512 * 1000)

const (
	// healthLogPage is the log page identifier passed to GetLogPage to
	// request the health log page.
	healthLogPage = 0x02
)

161// GetHealthInfo gets health information from the NVMe device's health log page.
162func (d *Device) GetHealthInfo() (*HealthInfo, error) {
163 var buf [512]byte
164
165 if err := d.GetLogPage(GlobalNamespace, healthLogPage, 0, 0, buf[:]); err != nil {
166 return nil, fmt.Errorf("unable to get health log page: %w", err)
167 }
168
169 var page healthPage
170 binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page)
171 var res HealthInfo
172 res.AvailableSpareSpaceCritical = page.CriticalWarning&(1<<0) != 0
173 res.TemperatureCritical = page.CriticalWarning&(1<<1) != 0
174 res.MediaCritical = page.CriticalWarning&(1<<2) != 0
175 res.ForcedReadOnly = page.CriticalWarning&(1<<3) != 0
176 res.VolatileMemoryBackupFailed = page.CriticalWarning&(1<<4) != 0
177 res.CompositeTemperatureKelvin = page.CompositeTemperature
178 res.AvailableSpare = float32(page.AvailableSpare) / 100.
179 res.AvailableSpareThreshold = float32(page.AvailableSpareThreshold) / 100.
180 res.LifeUsed = float32(page.PercentageUsed) / 100.
181 res.BytesRead = new(big.Int).Mul(page.DataUnitsRead.BigInt(), dataUnit)
182 res.BytesWritten = new(big.Int).Mul(page.DataUnitsWritten.BigInt(), dataUnit)
183 res.HostReadCommands = page.HostReadCommands.BigInt()
184 res.HostWriteCommands = page.HostWriteCommands.BigInt()
185 res.ControllerBusyTime = time.Duration(page.ControllerBusyTime.Uint64()) * time.Minute
186 res.PowerCycles = page.PowerCycles.Uint64()
187 res.PowerOnHours = page.PowerOnHours.Uint64()
188 res.UnsafeShutdowns = page.UnsafeSHutdowns.Uint64()
189 res.MediaAndDataIntegrityErrors = page.MediaAndDataIntegrityErrors.Uint64()
190 res.ErrorInformationLogEntriesCount = page.ErrorInformationLogEntries.Uint64()
191 res.WarningCompositeTemperatureTime = time.Duration(page.WarningCompositeTemperatureTime) * time.Minute
192 res.CriticalCompositeTemperatureTime = time.Duration(page.CriticalCompositeTemperatureTime) * time.Minute
193 res.TemperatureSensorValues = page.TemperatureSensors
194 res.ThermalMgmtTemperature1TransitionCount = page.ThermalMgmtTemperature1TransitionCount
195 res.ThermalMgmtTemperature2TransitionCount = page.ThermalMgmtTemperature2TransitionCount
196 res.TotalTimeForThermalMgmtTemperature1 = time.Duration(page.TotalTimeForThermalMgmtTemperature1) * time.Second
197 res.TotalTimeForThermalMgmtTemperature2 = time.Duration(page.TotalTimeForThermalMgmtTemperature2) * time.Second
198 return &res, nil
199}