| package nvme |
| |
| import ( |
| "bytes" |
| "encoding/binary" |
| "fmt" |
| "math/big" |
| "time" |
| ) |
| |
| // healthPage represents the raw data from a NVMe Health/SMART page. |
| // See Figure 93 in the spec. |
| type healthPage struct { |
| CriticalWarning uint8 |
| CompositeTemperature uint16 |
| AvailableSpare uint8 |
| AvailableSpareThreshold uint8 |
| PercentageUsed uint8 |
| |
| _ [26]byte |
| |
| DataUnitsRead uint128le |
| DataUnitsWritten uint128le |
| HostReadCommands uint128le |
| HostWriteCommands uint128le |
| ControllerBusyTime uint128le |
| PowerCycles uint128le |
| PowerOnHours uint128le |
| UnsafeSHutdowns uint128le |
| MediaAndDataIntegrityErrors uint128le |
| ErrorInformationLogEntries uint128le |
| |
| WarningCompositeTemperatureTime uint32 |
| CriticalCompositeTemperatureTime uint32 |
| |
| TemperatureSensors [8]uint16 |
| |
| ThermalMgmtTemperature1TransitionCount uint32 |
| ThermalMgmtTemperature2TransitionCount uint32 |
| |
| _ [8]byte |
| |
| TotalTimeForThermalMgmtTemperature1 uint32 |
| TotalTimeForThermalMgmtTemperature2 uint32 |
| } |
| |
// HealthInfo contains information related to the health of the NVMe device.
//
// The device reports several counters as 128-bit integers, which Go has no
// native type for. Counters that are very unlikely to ever exceed 64 bits are
// exposed as 64-bit integers for convenience; under highly abnormal
// circumstances such values might be clamped.
type HealthInfo struct {
	// AvailableSpareSpaceCritical is set if the available spare capacity has
	// fallen below the threshold.
	AvailableSpareSpaceCritical bool
	// TemperatureCritical is set if a temperature is outside the acceptable
	// operating thresholds.
	TemperatureCritical bool
	// MediaCritical is set if significant media or internal issues affect the
	// operation of the device.
	MediaCritical bool
	// ForcedReadOnly is set if the device is forced into read-only mode due
	// to an error.
	ForcedReadOnly bool
	// VolatileMemoryBackupFailed is set if the volatile memory backup device
	// has failed.
	VolatileMemoryBackupFailed bool
	// CompositeTemperatureKelvin contains a derived value representing the
	// composite state of controller and namespace/flash temperature.
	// The exact mechanism used to derive it is vendor-specific.
	CompositeTemperatureKelvin uint16
	// AvailableSpare represents the relative amount (0-1) of spare capacity
	// still unused.
	AvailableSpare float32
	// AvailableSpareThreshold represents the vendor-defined threshold which
	// AvailableSpare should not fall under.
	AvailableSpareThreshold float32
	// LifeUsed represents a vendor-defined relative estimate of the life of
	// the device which has been used up. It is allowed to exceed 1 and will
	// be clamped by the device somewhere between 1.0 and 2.55.
	LifeUsed float32
	// BytesRead contains the number of bytes read from the device.
	// The device counts in units of 512,000 bytes (1000 x 512 B), so this
	// value only changes in steps of that size.
	BytesRead *big.Int
	// BytesWritten contains the number of bytes written to the device.
	// The device counts in units of 512,000 bytes (1000 x 512 B), so this
	// value only changes in steps of that size.
	BytesWritten *big.Int
	// HostReadCommands contains the number of read commands completed by the
	// controller.
	HostReadCommands *big.Int
	// HostWriteCommands contains the number of write commands completed by the
	// controller.
	HostWriteCommands *big.Int
	// ControllerBusyTime contains the cumulative amount of time the controller
	// has spent being busy (i.e. having at least one command outstanding on an
	// I/O queue). This value is only updated in one-minute increments.
	ControllerBusyTime time.Duration
	// PowerCycles contains the number of power cycles.
	PowerCycles uint64
	// PowerOnHours contains the number of hours the controller has been
	// powered on. Depending on the vendor implementation it may or may
	// not contain time spent in a non-operational power state.
	PowerOnHours uint64
	// UnsafeShutdowns contains the number of power loss events without
	// a prior shutdown notification from the host.
	UnsafeShutdowns uint64
	// MediaAndDataIntegrityErrors contains the number of occurrences where the
	// controller detected an unrecovered data integrity error.
	MediaAndDataIntegrityErrors uint64
	// ErrorInformationLogEntriesCount contains the number of Error
	// Information log entries over the life of the controller.
	ErrorInformationLogEntriesCount uint64
	// WarningCompositeTemperatureTime contains the amount of time the
	// controller is operational while the composite temperature is greater
	// than the warning composite threshold.
	WarningCompositeTemperatureTime time.Duration
	// CriticalCompositeTemperatureTime contains the amount of time the
	// controller is operational while the composite temperature is greater
	// than the critical composite threshold.
	CriticalCompositeTemperatureTime time.Duration
	// TemperatureSensorValues contains the current temperature in Kelvin as
	// reported by up to 8 sensors on the device. A value of zero means that
	// the given sensor is not available.
	TemperatureSensorValues [8]uint16
	// ThermalMgmtTemperature1TransitionCount contains the number of times the
	// controller transitioned to lower power active power states or performed
	// vendor-specific thermal management actions to reduce temperature.
	ThermalMgmtTemperature1TransitionCount uint32
	// ThermalMgmtTemperature2TransitionCount is the counterpart of
	// ThermalMgmtTemperature1TransitionCount for "heavier" thermal management
	// actions including heavy throttling. The actual difference is
	// vendor-specific.
	ThermalMgmtTemperature2TransitionCount uint32
	// TotalTimeForThermalMgmtTemperature1 contains the total time the
	// controller spent under "light" thermal management.
	TotalTimeForThermalMgmtTemperature1 time.Duration
	// TotalTimeForThermalMgmtTemperature2 contains the total time the
	// controller spent under "heavy" thermal management.
	TotalTimeForThermalMgmtTemperature2 time.Duration
}

// HasCriticalWarning reports whether at least one of the critical warning
// flags (AvailableSpareSpaceCritical, TemperatureCritical, MediaCritical,
// ForcedReadOnly, VolatileMemoryBackupFailed) is set.
// A true result means the NVMe medium has reason to believe that data
// availability or integrity is endangered.
func (h *HealthInfo) HasCriticalWarning() bool {
	switch {
	case h.AvailableSpareSpaceCritical,
		h.TemperatureCritical,
		h.MediaCritical,
		h.ForcedReadOnly,
		h.VolatileMemoryBackupFailed:
		return true
	}
	return false
}
| |
// dataUnit is the number of bytes represented by a single "data unit" of the
// Data Units Read/Written counters: 1000 units of 512 bytes each.
// See Figure 93 (Data Units Read) in the spec.
var dataUnit = new(big.Int).SetInt64(1000 * 512)
| |
// healthLogPage is the log page identifier requested when reading the
// health log page.
const healthLogPage = 0x02
| |
| // GetHealthInfo gets health information from the NVMe device's health log page. |
| func (d *Device) GetHealthInfo() (*HealthInfo, error) { |
| var buf [512]byte |
| |
| if err := d.GetLogPage(GlobalNamespace, healthLogPage, 0, 0, buf[:]); err != nil { |
| return nil, fmt.Errorf("unable to get health log page: %w", err) |
| } |
| |
| var page healthPage |
| binary.Read(bytes.NewReader(buf[:]), binary.LittleEndian, &page) |
| var res HealthInfo |
| res.AvailableSpareSpaceCritical = page.CriticalWarning&(1<<0) != 0 |
| res.TemperatureCritical = page.CriticalWarning&(1<<1) != 0 |
| res.MediaCritical = page.CriticalWarning&(1<<2) != 0 |
| res.ForcedReadOnly = page.CriticalWarning&(1<<3) != 0 |
| res.VolatileMemoryBackupFailed = page.CriticalWarning&(1<<4) != 0 |
| res.CompositeTemperatureKelvin = page.CompositeTemperature |
| res.AvailableSpare = float32(page.AvailableSpare) / 100. |
| res.AvailableSpareThreshold = float32(page.AvailableSpareThreshold) / 100. |
| res.LifeUsed = float32(page.PercentageUsed) / 100. |
| res.BytesRead = new(big.Int).Mul(page.DataUnitsRead.BigInt(), dataUnit) |
| res.BytesWritten = new(big.Int).Mul(page.DataUnitsWritten.BigInt(), dataUnit) |
| res.HostReadCommands = page.HostReadCommands.BigInt() |
| res.HostWriteCommands = page.HostWriteCommands.BigInt() |
| res.ControllerBusyTime = time.Duration(page.ControllerBusyTime.Uint64()) * time.Minute |
| res.PowerCycles = page.PowerCycles.Uint64() |
| res.PowerOnHours = page.PowerOnHours.Uint64() |
| res.UnsafeShutdowns = page.UnsafeSHutdowns.Uint64() |
| res.MediaAndDataIntegrityErrors = page.MediaAndDataIntegrityErrors.Uint64() |
| res.ErrorInformationLogEntriesCount = page.ErrorInformationLogEntries.Uint64() |
| res.WarningCompositeTemperatureTime = time.Duration(page.WarningCompositeTemperatureTime) * time.Minute |
| res.CriticalCompositeTemperatureTime = time.Duration(page.CriticalCompositeTemperatureTime) * time.Minute |
| res.TemperatureSensorValues = page.TemperatureSensors |
| res.ThermalMgmtTemperature1TransitionCount = page.ThermalMgmtTemperature1TransitionCount |
| res.ThermalMgmtTemperature2TransitionCount = page.ThermalMgmtTemperature2TransitionCount |
| res.TotalTimeForThermalMgmtTemperature1 = time.Duration(page.TotalTimeForThermalMgmtTemperature1) * time.Second |
| res.TotalTimeForThermalMgmtTemperature2 = time.Duration(page.TotalTimeForThermalMgmtTemperature2) * time.Second |
| return &res, nil |
| } |