Lorenz Brun | fba5da0 | 2022-12-15 11:20:47 +0000 | [diff] [blame] | 1 | package nvme |
| 2 | |
| 3 | import ( |
| 4 | "bytes" |
| 5 | "encoding/binary" |
| 6 | "fmt" |
| 7 | "math/big" |
| 8 | "time" |
| 9 | ) |
| 10 | |
| 11 | // healthPage represents the raw data from a NVMe Health/SMART page. |
| 12 | // See Figure 93 in the spec. |
| 13 | type healthPage struct { |
| 14 | CriticalWarning uint8 |
| 15 | CompositeTemperature uint16 |
| 16 | AvailableSpare uint8 |
| 17 | AvailableSpareThreshold uint8 |
| 18 | PercentageUsed uint8 |
| 19 | |
| 20 | _ [26]byte |
| 21 | |
| 22 | DataUnitsRead uint128le |
| 23 | DataUnitsWritten uint128le |
| 24 | HostReadCommands uint128le |
| 25 | HostWriteCommands uint128le |
| 26 | ControllerBusyTime uint128le |
| 27 | PowerCycles uint128le |
| 28 | PowerOnHours uint128le |
| 29 | UnsafeSHutdowns uint128le |
| 30 | MediaAndDataIntegrityErrors uint128le |
| 31 | ErrorInformationLogEntries uint128le |
| 32 | |
| 33 | WarningCompositeTemperatureTime uint32 |
| 34 | CriticalCompositeTemperatureTime uint32 |
| 35 | |
| 36 | TemperatureSensors [8]uint16 |
| 37 | |
| 38 | ThermalMgmtTemperature1TransitionCount uint32 |
| 39 | ThermalMgmtTemperature2TransitionCount uint32 |
| 40 | |
| 41 | _ [8]byte |
| 42 | |
| 43 | TotalTimeForThermalMgmtTemperature1 uint32 |
| 44 | TotalTimeForThermalMgmtTemperature2 uint32 |
| 45 | } |
| 46 | |
// HealthInfo describes the health of an NVMe device in a form that is
// convenient to consume from Go.
//
// The device reports most counters as 128-bit integers, which Go has no
// native type for. Counters which are very unlikely to ever exceed 64 bits
// are exposed as uint64 for easier handling; as a consequence they might be
// clamped under highly abnormal circumstances.
type HealthInfo struct {
	// AvailableSpareSpaceCritical is set if the available spare capacity has
	// fallen below the critical threshold.
	AvailableSpareSpaceCritical bool
	// TemperatureCritical is set if a temperature is outside of the
	// acceptable operating thresholds.
	TemperatureCritical bool
	// MediaCritical is set if significant media or internal issues affect
	// the operation of the device.
	MediaCritical bool
	// ForcedReadOnly is set if the device has been forced into read-only
	// mode because of an error.
	ForcedReadOnly bool
	// VolatileMemoryBackupFailed is set if the volatile memory backup device
	// has failed.
	VolatileMemoryBackupFailed bool
	// CompositeTemperatureKelvin is a derived value representing the
	// composite state of the controller and namespace/flash temperature.
	// The exact mechanism used to derive it is vendor-specific.
	CompositeTemperatureKelvin uint16
	// AvailableSpare is the relative amount (0-1) of spare capacity which
	// is still unused.
	AvailableSpare float32
	// AvailableSpareThreshold is the vendor-defined threshold which
	// AvailableSpare should not fall below.
	AvailableSpareThreshold float32
	// LifeUsed is a vendor-defined relative estimate of how much of the
	// device's life has been used up. It is allowed to exceed 1 and will be
	// clamped by the device somewhere between 1.0 and 2.55.
	LifeUsed float32
	// BytesRead is the number of bytes read from the device.
	// This value only advances in steps of 512,000 bytes (1000 units of
	// 512 bytes).
	BytesRead *big.Int
	// BytesWritten is the number of bytes written to the device.
	// This value only advances in steps of 512,000 bytes (1000 units of
	// 512 bytes).
	BytesWritten *big.Int
	// HostReadCommands is the number of read commands completed by the
	// controller.
	HostReadCommands *big.Int
	// HostWriteCommands is the number of write commands completed by the
	// controller.
	HostWriteCommands *big.Int
	// ControllerBusyTime is the cumulative amount of time the controller
	// has spent being busy (i.e. having at least one command outstanding on
	// an I/O queue). This value is only updated in one-minute increments.
	ControllerBusyTime time.Duration
	// PowerCycles is the number of power cycles.
	PowerCycles uint64
	// PowerOnHours is the number of hours the controller has been powered
	// on. Depending on the vendor implementation it may or may not include
	// time spent in a non-operational power state.
	PowerOnHours uint64
	// UnsafeShutdowns is the number of power loss events which were not
	// preceded by a shutdown notification from the host.
	UnsafeShutdowns uint64
	// MediaAndDataIntegrityErrors is the number of occurrences where the
	// controller detected an unrecovered data integrity error.
	MediaAndDataIntegrityErrors uint64
	// ErrorInformationLogEntriesCount is the number of Error Information
	// log entries over the life of the controller.
	ErrorInformationLogEntriesCount uint64
	// WarningCompositeTemperatureTime is the amount of time the controller
	// was operational while the composite temperature was greater than the
	// warning composite threshold.
	WarningCompositeTemperatureTime time.Duration
	// CriticalCompositeTemperatureTime is the amount of time the controller
	// was operational while the composite temperature was greater than the
	// critical composite threshold.
	CriticalCompositeTemperatureTime time.Duration
	// TemperatureSensorValues holds the current temperature in Kelvin as
	// reported by up to 8 sensors on the device. A value of zero means that
	// the given sensor is not available.
	TemperatureSensorValues [8]uint16
	// ThermalMgmtTemperature1TransitionCount is the number of times the
	// controller transitioned to lower power active power states or
	// performed vendor-specific thermal management actions to reduce
	// temperature.
	ThermalMgmtTemperature1TransitionCount uint32
	// ThermalMgmtTemperature2TransitionCount is the same as
	// ThermalMgmtTemperature1TransitionCount, but for "heavier" thermal
	// management actions including heavy throttling. The actual difference
	// is vendor-specific.
	ThermalMgmtTemperature2TransitionCount uint32
	// TotalTimeForThermalMgmtTemperature1 is the total time the controller
	// spent under "light" thermal management.
	TotalTimeForThermalMgmtTemperature1 time.Duration
	// TotalTimeForThermalMgmtTemperature2 is the total time the controller
	// spent under "heavy" thermal management.
	TotalTimeForThermalMgmtTemperature2 time.Duration
}
| 141 | |
| 142 | // HasCriticalWarning returns true if any of the critical warnings |
| 143 | // (AvailableSpareSpaceCritical, TemperatureCritical, MediaCritical, |
| 144 | // ForcedReadOnly, VolatileMemoryBackupFailed) are active. |
| 145 | // If this returns true the NVMe medium has reason to believe that |
| 146 | // data availability or integrity is endangered. |
| 147 | func (h *HealthInfo) HasCriticalWarning() bool { |
| 148 | return h.AvailableSpareSpaceCritical || h.TemperatureCritical || h.MediaCritical || h.ForcedReadOnly || h.VolatileMemoryBackupFailed |
| 149 | } |
| 150 | |
// dataUnit is the number of bytes represented by a single data unit of the
// health log page: one thousand units of 512 bytes each.
// See Figure 93 Data Units Read.
var dataUnit = big.NewInt(1000 * 512)
| 153 | |
// healthLogPage is the log page identifier which GetHealthInfo passes to
// GetLogPage to request the health log page.
const healthLogPage = 0x02
| 157 | |
| 158 | // GetHealthInfo gets health information from the NVMe device's health log page. |
| 159 | func (d *Device) GetHealthInfo() (*HealthInfo, error) { |
| 160 | var buf [512]byte |
| 161 | |
| 162 | if err := d.GetLogPage(GlobalNamespace, healthLogPage, 0, 0, buf[:]); err != nil { |
| 163 | return nil, fmt.Errorf("unable to get health log page: %w", err) |
| 164 | } |
| 165 | |
| 166 | var page healthPage |
| 167 | binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page) |
| 168 | var res HealthInfo |
| 169 | res.AvailableSpareSpaceCritical = page.CriticalWarning&(1<<0) != 0 |
| 170 | res.TemperatureCritical = page.CriticalWarning&(1<<1) != 0 |
| 171 | res.MediaCritical = page.CriticalWarning&(1<<2) != 0 |
| 172 | res.ForcedReadOnly = page.CriticalWarning&(1<<3) != 0 |
| 173 | res.VolatileMemoryBackupFailed = page.CriticalWarning&(1<<4) != 0 |
| 174 | res.CompositeTemperatureKelvin = page.CompositeTemperature |
| 175 | res.AvailableSpare = float32(page.AvailableSpare) / 100. |
| 176 | res.AvailableSpareThreshold = float32(page.AvailableSpareThreshold) / 100. |
| 177 | res.LifeUsed = float32(page.PercentageUsed) / 100. |
| 178 | res.BytesRead = new(big.Int).Mul(page.DataUnitsRead.BigInt(), dataUnit) |
| 179 | res.BytesWritten = new(big.Int).Mul(page.DataUnitsWritten.BigInt(), dataUnit) |
| 180 | res.HostReadCommands = page.HostReadCommands.BigInt() |
| 181 | res.HostWriteCommands = page.HostWriteCommands.BigInt() |
| 182 | res.ControllerBusyTime = time.Duration(page.ControllerBusyTime.Uint64()) * time.Minute |
| 183 | res.PowerCycles = page.PowerCycles.Uint64() |
| 184 | res.PowerOnHours = page.PowerOnHours.Uint64() |
| 185 | res.UnsafeShutdowns = page.UnsafeSHutdowns.Uint64() |
| 186 | res.MediaAndDataIntegrityErrors = page.MediaAndDataIntegrityErrors.Uint64() |
| 187 | res.ErrorInformationLogEntriesCount = page.ErrorInformationLogEntries.Uint64() |
| 188 | res.WarningCompositeTemperatureTime = time.Duration(page.WarningCompositeTemperatureTime) * time.Minute |
| 189 | res.CriticalCompositeTemperatureTime = time.Duration(page.CriticalCompositeTemperatureTime) * time.Minute |
| 190 | res.TemperatureSensorValues = page.TemperatureSensors |
| 191 | res.ThermalMgmtTemperature1TransitionCount = page.ThermalMgmtTemperature1TransitionCount |
| 192 | res.ThermalMgmtTemperature2TransitionCount = page.ThermalMgmtTemperature2TransitionCount |
| 193 | res.TotalTimeForThermalMgmtTemperature1 = time.Duration(page.TotalTimeForThermalMgmtTemperature1) * time.Second |
| 194 | res.TotalTimeForThermalMgmtTemperature2 = time.Duration(page.TotalTimeForThermalMgmtTemperature2) * time.Second |
| 195 | return &res, nil |
| 196 | } |