blob: 775742ff09c2ac59495cd3ca5675a844a26d79d3 [file] [log] [blame]
Lorenz Brunfba5da02022-12-15 11:20:47 +00001package nvme
2
3import (
4 "bytes"
5 "encoding/binary"
6 "fmt"
7 "math/big"
8 "time"
9)
10
11// healthPage represents the raw data from a NVMe Health/SMART page.
12// See Figure 93 in the spec.
13type healthPage struct {
14 CriticalWarning uint8
15 CompositeTemperature uint16
16 AvailableSpare uint8
17 AvailableSpareThreshold uint8
18 PercentageUsed uint8
19
20 _ [26]byte
21
22 DataUnitsRead uint128le
23 DataUnitsWritten uint128le
24 HostReadCommands uint128le
25 HostWriteCommands uint128le
26 ControllerBusyTime uint128le
27 PowerCycles uint128le
28 PowerOnHours uint128le
29 UnsafeSHutdowns uint128le
30 MediaAndDataIntegrityErrors uint128le
31 ErrorInformationLogEntries uint128le
32
33 WarningCompositeTemperatureTime uint32
34 CriticalCompositeTemperatureTime uint32
35
36 TemperatureSensors [8]uint16
37
38 ThermalMgmtTemperature1TransitionCount uint32
39 ThermalMgmtTemperature2TransitionCount uint32
40
41 _ [8]byte
42
43 TotalTimeForThermalMgmtTemperature1 uint32
44 TotalTimeForThermalMgmtTemperature2 uint32
45}
46
// HealthInfo contains information related to the health of the NVMe device.
//
// Note that some values might be clamped under highly abnormal circumstances
// as they are reported as 128-bit integers which Go doesn't support.
// For easier handling, values which are very unlikely to exceed 64 bits are
// exposed as 64-bit integers.
type HealthInfo struct {
	// AvailableSpareSpaceCritical is set if the available spare capacity has
	// fallen below the available spare threshold.
	AvailableSpareSpaceCritical bool
	// TemperatureCritical is set if a temperature is outside the acceptable
	// operating thresholds.
	TemperatureCritical bool
	// MediaCritical is set if significant media or internal issues affect the
	// operation of the device.
	MediaCritical bool
	// ForcedReadOnly is set if the device is forced into read-only mode due
	// to an error.
	ForcedReadOnly bool
	// VolatileMemoryBackupFailed is set if the volatile memory backup device
	// has failed.
	VolatileMemoryBackupFailed bool
	// CompositeTemperatureKelvin contains a derived value representing the
	// composite state of controller and namespace/flash temperature.
	// The exact mechanism used to derive it is vendor-specific.
	CompositeTemperatureKelvin uint16
	// AvailableSpare represents the relative amount (0-1) of spare capacity
	// still unused.
	AvailableSpare float32
	// AvailableSpareThreshold represents the vendor-defined threshold which
	// AvailableSpare should not fall under.
	AvailableSpareThreshold float32
	// LifeUsed represents a vendor-defined relative estimate of the life of
	// the device which has been used up. It is allowed to exceed 1 and will
	// be clamped by the device somewhere between 1.0 and 2.55.
	LifeUsed float32
	// BytesRead contains the number of bytes read from the device.
	// This value is only updated in increments of 512,000 bytes
	// (1000 units of 512 bytes).
	BytesRead *big.Int
	// BytesWritten contains the number of bytes written to the device.
	// This value is only updated in increments of 512,000 bytes
	// (1000 units of 512 bytes).
	BytesWritten *big.Int
	// HostReadCommands contains the number of read commands completed by the
	// controller.
	HostReadCommands *big.Int
	// HostWriteCommands contains the number of write commands completed by the
	// controller.
	HostWriteCommands *big.Int
	// ControllerBusyTime contains the cumulative amount of time the controller
	// has spent being busy (i.e. having at least one command outstanding on an
	// I/O queue). This value is only updated in one-minute increments.
	ControllerBusyTime time.Duration
	// PowerCycles contains the number of power cycles.
	PowerCycles uint64
	// PowerOnHours contains the number of hours the controller has been
	// powered on. Depending on the vendor implementation it may or may
	// not contain time spent in a non-operational power state.
	PowerOnHours uint64
	// UnsafeShutdowns contains the number of power loss events without
	// a prior shutdown notification from the host.
	UnsafeShutdowns uint64
	// MediaAndDataIntegrityErrors contains the number of occurrences where the
	// controller detected an unrecovered data integrity error.
	MediaAndDataIntegrityErrors uint64
	// ErrorInformationLogEntriesCount contains the number of Error
	// Information log entries over the life of the controller.
	ErrorInformationLogEntriesCount uint64
	// WarningCompositeTemperatureTime contains the amount of time the
	// controller is operational while the composite temperature is greater
	// than the warning composite threshold.
	WarningCompositeTemperatureTime time.Duration
	// CriticalCompositeTemperatureTime contains the amount of time the
	// controller is operational while the composite temperature is greater
	// than the critical composite threshold.
	CriticalCompositeTemperatureTime time.Duration
	// TemperatureSensorValues contains the current temperature in Kelvin as
	// reported by up to 8 sensors on the device. A value of zero means that
	// the given sensor is not available.
	TemperatureSensorValues [8]uint16
	// ThermalMgmtTemperature1TransitionCount contains the number of times the
	// controller transitioned to lower power active power states or performed
	// vendor-specific thermal management actions to reduce temperature.
	ThermalMgmtTemperature1TransitionCount uint32
	// ThermalMgmtTemperature2TransitionCount is the same as
	// ThermalMgmtTemperature1TransitionCount, but for "heavier" thermal
	// management actions including heavy throttling.
	// The actual difference is vendor-specific.
	ThermalMgmtTemperature2TransitionCount uint32
	// TotalTimeForThermalMgmtTemperature1 contains the total time the
	// controller spent under "light" thermal management.
	TotalTimeForThermalMgmtTemperature1 time.Duration
	// TotalTimeForThermalMgmtTemperature2 contains the total time the
	// controller spent under "heavy" thermal management.
	TotalTimeForThermalMgmtTemperature2 time.Duration
}
141
142// HasCriticalWarning returns true if any of the critical warnings
143// (AvailableSpareSpaceCritical, TemperatureCritical, MediaCritical,
144// ForcedReadOnly, VolatileMemoryBackupFailed) are active.
145// If this returns true the NVMe medium has reason to believe that
146// data availability or integrity is endangered.
147func (h *HealthInfo) HasCriticalWarning() bool {
148 return h.AvailableSpareSpaceCritical || h.TemperatureCritical || h.MediaCritical || h.ForcedReadOnly || h.VolatileMemoryBackupFailed
149}
150
// dataUnit is the number of bytes represented by one count of the data
// units read/written counters: 1000 units of 512 bytes each.
// See Figure 93, Data Units Read.
var dataUnit = big.NewInt(1000 * 512)

// healthLogPage is the log page identifier passed to GetLogPage to request
// the Health/SMART page.
const healthLogPage = 0x02
157
158// GetHealthInfo gets health information from the NVMe device's health log page.
159func (d *Device) GetHealthInfo() (*HealthInfo, error) {
160 var buf [512]byte
161
162 if err := d.GetLogPage(GlobalNamespace, healthLogPage, 0, 0, buf[:]); err != nil {
163 return nil, fmt.Errorf("unable to get health log page: %w", err)
164 }
165
166 var page healthPage
167 binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page)
168 var res HealthInfo
169 res.AvailableSpareSpaceCritical = page.CriticalWarning&(1<<0) != 0
170 res.TemperatureCritical = page.CriticalWarning&(1<<1) != 0
171 res.MediaCritical = page.CriticalWarning&(1<<2) != 0
172 res.ForcedReadOnly = page.CriticalWarning&(1<<3) != 0
173 res.VolatileMemoryBackupFailed = page.CriticalWarning&(1<<4) != 0
174 res.CompositeTemperatureKelvin = page.CompositeTemperature
175 res.AvailableSpare = float32(page.AvailableSpare) / 100.
176 res.AvailableSpareThreshold = float32(page.AvailableSpareThreshold) / 100.
177 res.LifeUsed = float32(page.PercentageUsed) / 100.
178 res.BytesRead = new(big.Int).Mul(page.DataUnitsRead.BigInt(), dataUnit)
179 res.BytesWritten = new(big.Int).Mul(page.DataUnitsWritten.BigInt(), dataUnit)
180 res.HostReadCommands = page.HostReadCommands.BigInt()
181 res.HostWriteCommands = page.HostWriteCommands.BigInt()
182 res.ControllerBusyTime = time.Duration(page.ControllerBusyTime.Uint64()) * time.Minute
183 res.PowerCycles = page.PowerCycles.Uint64()
184 res.PowerOnHours = page.PowerOnHours.Uint64()
185 res.UnsafeShutdowns = page.UnsafeSHutdowns.Uint64()
186 res.MediaAndDataIntegrityErrors = page.MediaAndDataIntegrityErrors.Uint64()
187 res.ErrorInformationLogEntriesCount = page.ErrorInformationLogEntries.Uint64()
188 res.WarningCompositeTemperatureTime = time.Duration(page.WarningCompositeTemperatureTime) * time.Minute
189 res.CriticalCompositeTemperatureTime = time.Duration(page.CriticalCompositeTemperatureTime) * time.Minute
190 res.TemperatureSensorValues = page.TemperatureSensors
191 res.ThermalMgmtTemperature1TransitionCount = page.ThermalMgmtTemperature1TransitionCount
192 res.ThermalMgmtTemperature2TransitionCount = page.ThermalMgmtTemperature2TransitionCount
193 res.TotalTimeForThermalMgmtTemperature1 = time.Duration(page.TotalTimeForThermalMgmtTemperature1) * time.Second
194 res.TotalTimeForThermalMgmtTemperature2 = time.Duration(page.TotalTimeForThermalMgmtTemperature2) * time.Second
195 return &res, nil
196}